Thrift-1023:Thrift encoding (UTF-8) issue with Ruby 1.9.2 Client: rb Patch: Nathan Beyer Fixes encoding issue for UTF-8 strings in ruby client. git-svn-id: https://svn.apache.org/repos/asf/thrift/trunk@1395832 13f79535-47bb-0310-9956-ffa450edef68

commit: b5a18a150ee73003ab760f85023e5fcb3625c8e5 [log] [tgz]
author: Jake Farrell <jfarrell@apache.org> Tue Oct 09 01:10:43 2012 +0000
committer: Jake Farrell <jfarrell@apache.org> Tue Oct 09 01:10:43 2012 +0000
tree: 7a721c8263485511291e94d877aef54f5b5c71ea
parent: fc35612d1dba14b47dadfed1c354f4d20be63e03 [diff]
diff --git a/lib/rb/ext/binary_protocol_accelerated.c b/lib/rb/ext/binary_protocol_accelerated.c
index bd1c2da..a8ebe7f 100644
--- a/lib/rb/ext/binary_protocol_accelerated.c
+++ b/lib/rb/ext/binary_protocol_accelerated.c

@@ -22,7 +22,8 @@
 #include <stdint.h>
 #include <constants.h>
 #include <struct.h>
-#include "macros.h"
+#include <macros.h>
+#include <bytes.h>
 
 VALUE rb_thrift_binary_proto_native_qmark(VALUE self) {
   return Qtrue;
@@ -80,6 +81,7 @@
   if (TYPE(str) != T_STRING) {
     rb_raise(rb_eStandardError, "Value should be a string");    
   }
+  str = convert_to_utf8_byte_buffer(str);
   write_i32_direct(trans, RSTRING_LEN(str));
   rb_funcall(trans, write_method_id, 1, str);
 }
@@ -380,7 +382,8 @@
 
 VALUE rb_thrift_binary_proto_read_string(VALUE self) {
   int size = read_i32_direct(self);
-  return READ(self, size);
+  VALUE buffer = READ(self, size);
+  return convert_to_string(buffer);
 }
 
 void Init_binary_protocol_accelerated() {

diff --git a/lib/rb/ext/bytes.c b/lib/rb/ext/bytes.c
new file mode 100644
index 0000000..8a6fac4
--- /dev/null
+++ b/lib/rb/ext/bytes.c

@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <ruby.h>
+#ifdef HAVE_RUBY_ENCODING_H
+#include <ruby/encoding.h>
+#endif
+#include <constants.h>
+
+VALUE force_binary_encoding(VALUE buffer) {
+  return rb_funcall(thrift_bytes_module, force_binary_encoding_id, 1, buffer);
+}
+
+VALUE convert_to_utf8_byte_buffer(VALUE string) {
+  return rb_funcall(thrift_bytes_module, convert_to_utf8_byte_buffer_id, 1, string);
+}
+
+VALUE convert_to_string(VALUE utf8_buffer) {
+  return rb_funcall(thrift_bytes_module, convert_to_string_id, 1, utf8_buffer);
+}

diff --git a/lib/rb/ext/bytes.h b/lib/rb/ext/bytes.h
new file mode 100644
index 0000000..7108d83
--- /dev/null
+++ b/lib/rb/ext/bytes.h

@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <ruby.h>
+
+/*
+ * A collection of utilities for working with bytes and byte buffers.
+ *
+ * These functions are the native analogues of some of the methods in
+ * Thrift::Bytes (thrift/bytes.rb).
+ */
+
+VALUE force_binary_encoding(VALUE buffer);
+VALUE convert_to_utf8_byte_buffer(VALUE string);
+VALUE convert_to_string(VALUE utf8_buffer);

diff --git a/lib/rb/ext/compact_protocol.c b/lib/rb/ext/compact_protocol.c
index a47fe6c..0c05481 100644
--- a/lib/rb/ext/compact_protocol.c
+++ b/lib/rb/ext/compact_protocol.c

@@ -20,9 +20,10 @@
 #include <ruby.h>
 #include <stdbool.h>
 #include <stdint.h>
-#include "constants.h"
-#include "struct.h"
-#include "macros.h"
+#include <constants.h>
+#include <struct.h>
+#include <macros.h>
+#include <bytes.h>
 
 #define LAST_ID(obj) FIX2INT(rb_ary_pop(rb_ivar_get(obj, last_field_id)))
 #define SET_LAST_ID(obj, val) rb_ary_push(rb_ivar_get(obj, last_field_id), val)
@@ -305,6 +306,7 @@
 
 VALUE rb_thrift_compact_proto_write_string(VALUE self, VALUE str) {
   VALUE transport = GET_TRANSPORT(self);
+  str = convert_to_utf8_byte_buffer(str);
   write_varint32(transport, RSTRING_LEN(str));
   WRITE(transport, RSTRING_PTR(str), RSTRING_LEN(str));
   return Qnil;
@@ -546,7 +548,8 @@
 
 VALUE rb_thrift_compact_proto_read_string(VALUE self) {
   int64_t size = read_varint64(self);
-  return READ(self, size);
+  VALUE buffer = READ(self, size);
+  return convert_to_string(buffer);
 }
 
 static void Init_constants() {

diff --git a/lib/rb/ext/constants.h b/lib/rb/ext/constants.h
index 9ea00d2..3bfac88 100644
--- a/lib/rb/ext/constants.h
+++ b/lib/rb/ext/constants.h

@@ -76,6 +76,9 @@
 extern ID read_all_method_id;
 extern ID read_into_buffer_method_id;
 extern ID native_qmark_method_id;
+extern ID force_binary_encoding_id;
+extern ID convert_to_utf8_byte_buffer_id;
+extern ID convert_to_string_id;
 
 extern ID fields_const_id;
 extern ID transport_ivar_id;
@@ -92,5 +95,6 @@
 extern VALUE rb_cSet;
 extern VALUE thrift_module;
 extern VALUE thrift_types_module;
+extern VALUE thrift_bytes_module;
 extern VALUE class_thrift_protocol;
 extern VALUE protocol_exception_class;

diff --git a/lib/rb/ext/memory_buffer.c b/lib/rb/ext/memory_buffer.c
index 319b073..e7253dc 100644
--- a/lib/rb/ext/memory_buffer.c
+++ b/lib/rb/ext/memory_buffer.c

@@ -19,7 +19,8 @@
 
 #include <ruby.h>
 #include <constants.h>
-#include "macros.h"
+#include <bytes.h>
+#include <macros.h>
 
 ID buf_ivar_id;
 ID index_ivar_id;
@@ -37,6 +38,7 @@
 
 VALUE rb_thrift_memory_buffer_write(VALUE self, VALUE str) {
   VALUE buf = GET_BUF(self);
+  str = force_binary_encoding(str);
   rb_str_buf_cat(buf, RSTRING_PTR(str), RSTRING_LEN(str));
   return Qnil;
 }

diff --git a/lib/rb/ext/thrift_native.c b/lib/rb/ext/thrift_native.c
index 2f6bb1a..f066d6c 100644
--- a/lib/rb/ext/thrift_native.c
+++ b/lib/rb/ext/thrift_native.c

@@ -18,6 +18,7 @@
  */
 
 #include <ruby.h>
+#include <bytes.h>
 #include <struct.h>
 #include <binary_protocol_accelerated.h>
 #include <compact_protocol.h>
@@ -27,6 +28,7 @@
 // cached classes/modules
 VALUE rb_cSet;
 VALUE thrift_module;
+VALUE thrift_bytes_module;
 VALUE thrift_types_module;
 
 // TType constants
@@ -90,6 +92,9 @@
 ID read_all_method_id;
 ID read_into_buffer_method_id;
 ID native_qmark_method_id;
+ID force_binary_encoding_id;
+ID convert_to_utf8_byte_buffer_id;
+ID convert_to_string_id;
 
 // constant ids
 ID fields_const_id;
@@ -109,6 +114,7 @@
 void Init_thrift_native() {
   // cached classes
   thrift_module = rb_const_get(rb_cObject, rb_intern("Thrift"));
+  thrift_bytes_module = rb_const_get(thrift_module, rb_intern("Bytes"));
   thrift_types_module = rb_const_get(thrift_module, rb_intern("Types"));
   rb_cSet = rb_const_get(rb_cObject, rb_intern("Set"));
   protocol_exception_class = rb_const_get(thrift_module, rb_intern("ProtocolException"));
@@ -173,6 +179,9 @@
   read_all_method_id = rb_intern("read_all");
   read_into_buffer_method_id = rb_intern("read_into_buffer");
   native_qmark_method_id = rb_intern("native?");
+  force_binary_encoding_id = rb_intern("force_binary_encoding");
+  convert_to_utf8_byte_buffer_id = rb_intern("convert_to_utf8_byte_buffer");
+  convert_to_string_id = rb_intern("convert_to_string");
 
   // constant ids
   fields_const_id = rb_intern("FIELDS");

diff --git a/lib/rb/lib/thrift.rb b/lib/rb/lib/thrift.rb
index 72050b1..fb9e04a 100644
--- a/lib/rb/lib/thrift.rb
+++ b/lib/rb/lib/thrift.rb

@@ -22,6 +22,7 @@
 
 $:.unshift File.dirname(__FILE__)
 
+require 'thrift/bytes'
 require 'thrift/core_ext'
 require 'thrift/exceptions'
 require 'thrift/types'

diff --git a/lib/rb/lib/thrift/bytes.rb b/lib/rb/lib/thrift/bytes.rb
new file mode 100644
index 0000000..efd4f64
--- /dev/null
+++ b/lib/rb/lib/thrift/bytes.rb

@@ -0,0 +1,131 @@
+# encoding: ascii-8bit
+# 
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+module Thrift
+  # A collection of utilities for working with bytes and byte buffers.
+  module Bytes
+    if RUBY_VERSION >= '1.9'
+      # Creates an empty byte buffer (String with BINARY encoding)
+      #
+      # size - The Integer size of the buffer (default: nil) to create
+      #
+      # Returns a String with BINARY encoding, filled with null characters 
+      # if size is greater than zero
+      def self.empty_byte_buffer(size = nil)
+        if (size && size > 0)
+          "\0".force_encoding(Encoding::BINARY) * size
+        else
+          ''.force_encoding(Encoding::BINARY)
+        end
+      end
+
+      # Forces the encoding of the buffer to BINARY. If the buffer
+      # passed is frozen, then it will be duplicated.
+      #
+      # buffer - The String to force the encoding of.
+      #
+      # Returns the String passed with an encoding of BINARY; returned
+      # String may be a duplicate.
+      def self.force_binary_encoding(buffer)
+        buffer = buffer.dup if buffer.frozen?
+        buffer.force_encoding(Encoding::BINARY)
+      end
+
+      # Gets the byte value of a given position in a String.
+      #
+      # string - The String to retrieve the byte value from.
+      # index  - The Integer location of the byte value to retrieve.
+      #
+      # Returns an Integer value between 0 and 255.
+      def self.get_string_byte(string, index)
+        string.getbyte(index)
+      end
+
+      # Sets the byte value given to a given index in a String.
+      #
+      # string - The String to set the byte value in.
+      # index  - The Integer location to set the byte value at.
+      # byte   - The Integer value (0 to 255) to set in the string.
+      #
+      # Returns an Integer value of the byte value to set.
+      def self.set_string_byte(string, index, byte)
+        string.setbyte(index, byte)
+      end
+
+      # Converts the given String to a UTF-8 byte buffer.
+      #
+      # string - The String to convert.
+      #
+      # Returns a new String with BINARY encoding, containing the UTF-8
+      # bytes of the original string.
+      def self.convert_to_utf8_byte_buffer(string)
+        if string.encoding != Encoding::UTF_8
+          # transcode to UTF-8
+          string = string.encode(Encoding::UTF_8)
+        else
+          # encoding is already UTF-8, but a duplicate is needed
+          string = string.dup
+        end
+        string.force_encoding(Encoding::BINARY)
+      end
+
+      # Converts the given UTF-8 byte buffer into a String
+      #
+      # utf8_buffer - A String, with BINARY encoding, containing UTF-8 bytes
+      #
+      # Returns a new String with UTF-8 encoding.
+      def self.convert_to_string(utf8_buffer)
+        # duplicate the buffer, force encoding to UTF-8
+        utf8_buffer.dup.force_encoding(Encoding::UTF_8)
+      end
+    else
+      def self.empty_byte_buffer(size = nil)
+        if (size && size > 0)
+          "\0" * size
+        else
+          ''
+        end
+      end
+
+      def self.force_binary_encoding(buffer)
+        buffer
+      end
+
+      def self.get_string_byte(string, index)
+        string[index]
+      end
+
+      def self.set_string_byte(string, index, byte)
+        string[index] = byte
+      end
+
+      def self.convert_to_utf8_byte_buffer(string)
+        # This assumes $KCODE is 'UTF8'/'U', which would mean the String is already a UTF-8 byte buffer
+        # TODO consider handling other $KCODE values and transcoding with iconv
+        string
+      end
+
+      def self.convert_to_string(utf8_buffer)
+        # See comment in 'convert_to_utf8_byte_buffer' for relevant assumptions.
+        utf8_buffer
+      end
+    end
+  end
+end

diff --git a/lib/rb/lib/thrift/protocol/base_protocol.rb b/lib/rb/lib/thrift/protocol/base_protocol.rb
index b19909d..a5a174d 100644
--- a/lib/rb/lib/thrift/protocol/base_protocol.rb
+++ b/lib/rb/lib/thrift/protocol/base_protocol.rb

@@ -114,6 +114,13 @@
       raise NotImplementedError
     end
 
+    # Writes a Thrift String. In Ruby 1.9+, the String passed will be transcoded to UTF-8.
+    #
+    # str - The String to write.
+    #
+    # Raises EncodingError if the transcoding to UTF-8 fails.
+    #
+    # Returns nothing.
     def write_string(str)
       raise NotImplementedError
     end
@@ -178,6 +185,9 @@
       raise NotImplementedError
     end
 
+    # Reads a Thrift String. In Ruby 1.9+, all Strings will be returned with an Encoding of UTF-8.
+    #
+    # Returns a String.
     def read_string
       raise NotImplementedError
     end

diff --git a/lib/rb/lib/thrift/protocol/binary_protocol.rb b/lib/rb/lib/thrift/protocol/binary_protocol.rb
index f9adb20..2528276 100644
--- a/lib/rb/lib/thrift/protocol/binary_protocol.rb
+++ b/lib/rb/lib/thrift/protocol/binary_protocol.rb

@@ -32,8 +32,7 @@
 
       # Pre-allocated read buffer for fixed-size read methods. Needs to be at least 8 bytes long for
       # read_i64() and read_double().
-      @rbuf = "\0" * 8
-      @rbuf.force_encoding("BINARY") if @rbuf.respond_to?(:force_encoding)
+      @rbuf = Bytes.empty_byte_buffer(8)
     end
 
     def write_message_begin(name, type, seqid)
@@ -108,6 +107,7 @@
     end
 
     def write_string(str)
+      str = Bytes.convert_to_utf8_byte_buffer(str)
       write_i32(str.length)
       trans.write(str)
     end
@@ -214,9 +214,9 @@
     end
 
     def read_string
-      sz = read_i32
-      dat = trans.read_all(sz)
-      dat
+      size = read_i32
+      buffer = trans.read_all(size)
+      Bytes.convert_to_string(buffer)
     end
 
   end

diff --git a/lib/rb/lib/thrift/protocol/compact_protocol.rb b/lib/rb/lib/thrift/protocol/compact_protocol.rb
index ede82f2..758e1ae 100644
--- a/lib/rb/lib/thrift/protocol/compact_protocol.rb
+++ b/lib/rb/lib/thrift/protocol/compact_protocol.rb

@@ -100,8 +100,7 @@
       @boolean_value = nil
 
       # Pre-allocated read buffer for read_double().
-      @rbuf = "\0" * 8
-      @rbuf.force_encoding("BINARY") if @rbuf.respond_to?(:force_encoding)
+      @rbuf = Bytes.empty_byte_buffer(8)
     end
 
     def write_message_begin(name, type, seqid)
@@ -211,6 +210,7 @@
     end
 
     def write_string(str)
+      str = Bytes.convert_to_utf8_byte_buffer(str)
       write_varint32(str.length)
       @trans.write(str)
     end
@@ -333,7 +333,8 @@
 
     def read_string
       size = read_varint32()
-      trans.read_all(size)
+      buffer = trans.read_all(size)
+      Bytes.convert_to_string(buffer)
     end
     
     

diff --git a/lib/rb/lib/thrift/protocol/json_protocol.rb b/lib/rb/lib/thrift/protocol/json_protocol.rb
index ddbf193..6f8d1d1 100644
--- a/lib/rb/lib/thrift/protocol/json_protocol.rb
+++ b/lib/rb/lib/thrift/protocol/json_protocol.rb

@@ -1,3 +1,4 @@
+# encoding: UTF-8
 # 
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
@@ -482,13 +483,21 @@
     end
 
     # Decodes the four hex parts of a JSON escaped string character and returns
-    # the character via out. The first two characters must be "00".
+    # the character via out.
+    #
+    # Note - this only supports Unicode characters in the BMP (U+0000 to U+FFFF);
+    # characters above the BMP are encoded as two escape sequences (surrogate pairs),
+    # which is not yet implemented
     def read_json_escape_char
-      read_json_syntax_char('0')
-      read_json_syntax_char('0')
       str = @reader.read
       str += @reader.read
-      str.hex.chr
+      str += @reader.read
+      str += @reader.read
+      if RUBY_VERSION >= '1.9'
+        str.hex.chr(Encoding::UTF_8)
+      else
+        str.hex.chr
+      end
     end
 
     # Decodes a JSON string, including unescaping, and returns the string via str

diff --git a/lib/rb/lib/thrift/transport/base_transport.rb b/lib/rb/lib/thrift/transport/base_transport.rb
index 0a12cea..8790326 100644
--- a/lib/rb/lib/thrift/transport/base_transport.rb
+++ b/lib/rb/lib/thrift/transport/base_transport.rb

@@ -35,22 +35,14 @@
   end
 
   module TransportUtils
-    if RUBY_VERSION >= '1.9'
-      def self.get_string_byte(string, index)
-        string.getbyte(index)
-      end
+    # Deprecated: Use Thrift::Bytes instead
+    def self.get_string_byte(string, index)
+      Bytes.get_string_byte(string, index)
+    end
 
-      def self.set_string_byte(string, index, byte)
-        string.setbyte(index, byte)
-      end
-    else
-      def self.get_string_byte(string, index)
-        string[index]
-      end
-
-      def self.set_string_byte(string, index, byte)
-        string[index] = byte
-      end
+    # Deprecated: Use Thrift::Bytes instead
+    def self.set_string_byte(string, index, byte)
+      Bytes.set_string_byte(string, index, byte)
     end
   end
 
@@ -61,6 +53,11 @@
 
     def close; end
 
+    # Reads a number of bytes from the transport. In Ruby 1.9+, the String returned will have a BINARY (aka ASCII8BIT) encoding.
+    #
+    # sz - The number of bytes to read from the transport.
+    #
+    # Returns a String acting as a byte buffer.
     def read(sz)
       raise NotImplementedError
     end
@@ -68,7 +65,7 @@
     # Returns an unsigned byte as a Fixnum in the range (0..255).
     def read_byte
       buf = read_all(1)
-      return ::Thrift::TransportUtils.get_string_byte(buf, 0)
+      return Bytes.get_string_byte(buf, 0)
     end
 
     # Reads size bytes and copies them into buffer[0..size].
@@ -76,14 +73,14 @@
       tmp = read_all(size)
       i = 0
       tmp.each_byte do |byte|
-        ::Thrift::TransportUtils.set_string_byte(buffer, i, byte)
+        Bytes.set_string_byte(buffer, i, byte)
         i += 1
       end
       i
     end
 
     def read_all(size)
-      return '' if size <= 0
+      return Bytes.empty_byte_buffer if size <= 0
       buf = read(size)
       while (buf.length < size)
         chunk = read(size - buf.length)
@@ -92,7 +89,12 @@
     
       buf
     end
-  
+
+    # Writes the byte buffer to the transport. In Ruby 1.9+, the buffer will be forced into BINARY encoding.
+    #
+    # buf - A String acting as a byte buffer.
+    #
+    # Returns nothing.
     def write(buf); end
     alias_method :<<, :write
 
@@ -104,4 +106,4 @@
       return trans
     end
   end
-end
\ No newline at end of file
+end

diff --git a/lib/rb/lib/thrift/transport/buffered_transport.rb b/lib/rb/lib/thrift/transport/buffered_transport.rb
index 676a4d3..781d3c6 100644
--- a/lib/rb/lib/thrift/transport/buffered_transport.rb
+++ b/lib/rb/lib/thrift/transport/buffered_transport.rb

@@ -24,8 +24,8 @@
     
     def initialize(transport)
       @transport = transport
-      @wbuf = ''
-      @rbuf = ''
+      @wbuf = Bytes.empty_byte_buffer
+      @rbuf = Bytes.empty_byte_buffer
       @index = 0
     end
 
@@ -44,12 +44,12 @@
 
     def read(sz)
       @index += sz
-      ret = @rbuf.slice(@index - sz, sz) || ''
+      ret = @rbuf.slice(@index - sz, sz) || Bytes.empty_byte_buffer
 
       if ret.length == 0
         @rbuf = @transport.read([sz, DEFAULT_BUFFER].max)
         @index = sz
-        ret = @rbuf.slice(0, sz) || ''
+        ret = @rbuf.slice(0, sz) || Bytes.empty_byte_buffer
       end
 
       ret
@@ -65,9 +65,15 @@
       # The read buffer has some data now, read a single byte. Using get_string_byte() avoids
       # allocating a temp string of size 1 unnecessarily.
       @index += 1
-      return ::Thrift::TransportUtils.get_string_byte(@rbuf, @index - 1)
+      return Bytes.get_string_byte(@rbuf, @index - 1)
     end
 
+    # Reads a number of bytes from the transport into the buffer passed.
+    #
+    # buffer - The String (byte buffer) to write data to; this is assumed to have a BINARY encoding.
+    # size   - The number of bytes to read from the transport and write to the buffer.
+    #
+    # Returns the number of bytes read.
     def read_into_buffer(buffer, size)
       i = 0
       while i < size
@@ -78,8 +84,8 @@
         end
 
         # The read buffer has some data now, so copy bytes over to the output buffer.
-        byte = ::Thrift::TransportUtils.get_string_byte(@rbuf, @index)
-        ::Thrift::TransportUtils.set_string_byte(buffer, i, byte)
+        byte = Bytes.get_string_byte(@rbuf, @index)
+        Bytes.set_string_byte(buffer, i, byte)
         @index += 1
         i += 1
       end
@@ -87,13 +93,13 @@
     end
 
     def write(buf)
-      @wbuf << buf
+      @wbuf << Bytes.force_binary_encoding(buf)
     end
 
     def flush
-      if @wbuf != ''
+      unless @wbuf.empty?
         @transport.write(@wbuf)
-        @wbuf = ''
+        @wbuf = Bytes.empty_byte_buffer
       end
       
       @transport.flush

diff --git a/lib/rb/lib/thrift/transport/framed_transport.rb b/lib/rb/lib/thrift/transport/framed_transport.rb
index e7630d0..d806ce0 100644
--- a/lib/rb/lib/thrift/transport/framed_transport.rb
+++ b/lib/rb/lib/thrift/transport/framed_transport.rb

@@ -22,8 +22,8 @@
   class FramedTransport < BaseTransport
     def initialize(transport, read=true, write=true)
       @transport = transport
-      @rbuf      = ''
-      @wbuf      = ''
+      @rbuf      = Bytes.empty_byte_buffer
+      @wbuf      = Bytes.empty_byte_buffer
       @read      = read
       @write     = write
       @index      = 0
@@ -44,12 +44,12 @@
     def read(sz)
       return @transport.read(sz) unless @read
 
-      return '' if sz <= 0
+      return Bytes.empty_byte_buffer if sz <= 0
 
       read_frame if @index >= @rbuf.length
 
       @index += sz
-      @rbuf.slice(@index - sz, sz) || ''
+      @rbuf.slice(@index - sz, sz) || Bytes.empty_byte_buffer
     end
 
     def read_byte
@@ -60,7 +60,7 @@
       # The read buffer has some data now, read a single byte. Using get_string_byte() avoids
       # allocating a temp string of size 1 unnecessarily.
       @index += 1
-      return ::Thrift::TransportUtils.get_string_byte(@rbuf, @index - 1)
+      return Bytes.get_string_byte(@rbuf, @index - 1)
     end
 
     def read_into_buffer(buffer, size)
@@ -69,18 +69,18 @@
         read_frame if @index >= @rbuf.length
 
         # The read buffer has some data now, so copy bytes over to the output buffer.
-        byte = ::Thrift::TransportUtils.get_string_byte(@rbuf, @index)
-        ::Thrift::TransportUtils.set_string_byte(buffer, i, byte)
+        byte = Bytes.get_string_byte(@rbuf, @index)
+        Bytes.set_string_byte(buffer, i, byte)
         @index += 1
         i += 1
       end
       i
     end
 
-
-    def write(buf,sz=nil)
+    def write(buf, sz=nil)
       return @transport.write(buf) unless @write
 
+      buf = Bytes.force_binary_encoding(buf)
       @wbuf << (sz ? buf[0...sz] : buf)
     end
 
@@ -92,10 +92,11 @@
       return @transport.flush unless @write
 
       out = [@wbuf.length].pack('N')
+      # Array#pack should return a BINARY encoded String, so it shouldn't be necessary to force encoding
       out << @wbuf
       @transport.write(out)
       @transport.flush
-      @wbuf = ''
+      @wbuf = Bytes.empty_byte_buffer
     end
 
     private

diff --git a/lib/rb/lib/thrift/transport/http_client_transport.rb b/lib/rb/lib/thrift/transport/http_client_transport.rb
index 1ef0fab..07f74bc 100644
--- a/lib/rb/lib/thrift/transport/http_client_transport.rb
+++ b/lib/rb/lib/thrift/transport/http_client_transport.rb

@@ -29,12 +29,12 @@
     def initialize(url)
       @url = URI url
       @headers = {'Content-Type' => 'application/x-thrift'}
-      @outbuf = ""
+      @outbuf = Bytes.empty_byte_buffer
     end
 
     def open?; true end
     def read(sz); @inbuf.read sz end
-    def write(buf); @outbuf << buf end
+    def write(buf); @outbuf << Bytes.force_binary_encoding(buf) end
 
     def add_headers(headers)
       @headers = @headers.merge(headers)
@@ -42,11 +42,12 @@
 
     def flush
       http = Net::HTTP.new @url.host, @url.port
-      http.use_ssl = @url.scheme == "https"
+      http.use_ssl = @url.scheme == 'https'
       resp = http.post(@url.request_uri, @outbuf, @headers)
       data = resp.body
+      data = Bytes.force_binary_encoding(data)
       @inbuf = StringIO.new data
-      @outbuf = ""
+      @outbuf = Bytes.empty_byte_buffer
     end
   end
 end

diff --git a/lib/rb/lib/thrift/transport/io_stream_transport.rb b/lib/rb/lib/thrift/transport/io_stream_transport.rb
index be348aa..e3c8379 100644
--- a/lib/rb/lib/thrift/transport/io_stream_transport.rb
+++ b/lib/rb/lib/thrift/transport/io_stream_transport.rb

@@ -32,7 +32,7 @@
 
     def open?; not @input.closed? or not @output.closed? end
     def read(sz); @input.read(sz) end
-    def write(buf); @output.write(buf) end
+    def write(buf); @output.write(Bytes.force_binary_encoding(buf)) end
     def close; @input.close; @output.close end
     def to_io; @input end # we're assuming this is used in a IO.select for reading
   end

diff --git a/lib/rb/lib/thrift/transport/memory_buffer_transport.rb b/lib/rb/lib/thrift/transport/memory_buffer_transport.rb
index 62c5292..ad5ad85 100644
--- a/lib/rb/lib/thrift/transport/memory_buffer_transport.rb
+++ b/lib/rb/lib/thrift/transport/memory_buffer_transport.rb

@@ -28,7 +28,7 @@
     # this behavior is no longer required. If you wish to change it
     # go ahead, just make sure the specs pass
     def initialize(buffer = nil)
-      @buf = buffer || ''
+      @buf = buffer ? Bytes.force_binary_encoding(buffer) : Bytes.empty_byte_buffer
       @index = 0
     end
 
@@ -48,7 +48,7 @@
 
     # this method does not use the passed object directly but copies it
     def reset_buffer(new_buf = '')
-      @buf.replace new_buf
+      @buf.replace Bytes.force_binary_encoding(new_buf)
       @index = 0
     end
 
@@ -72,7 +72,7 @@
 
     def read_byte
       raise EOFError.new("Not enough bytes remain in buffer") if @index >= @buf.size
-      val = ::Thrift::TransportUtils.get_string_byte(@buf, @index)
+      val = Bytes.get_string_byte(@buf, @index)
       @index += 1
       if @index >= GARBAGE_BUFFER_SIZE
         @buf = @buf.slice(@index..-1)
@@ -87,8 +87,8 @@
         raise EOFError.new("Not enough bytes remain in buffer") if @index >= @buf.size
 
         # The read buffer has some data now, so copy bytes over to the output buffer.
-        byte = ::Thrift::TransportUtils.get_string_byte(@buf, @index)
-        ::Thrift::TransportUtils.set_string_byte(buffer, i, byte)
+        byte = Bytes.get_string_byte(@buf, @index)
+        Bytes.set_string_byte(buffer, i, byte)
         @index += 1
         i += 1
       end
@@ -100,7 +100,7 @@
     end
 
     def write(wbuf)
-      @buf << wbuf
+      @buf << Bytes.force_binary_encoding(wbuf)
     end
 
     def flush

diff --git a/lib/rb/lib/thrift/transport/socket.rb b/lib/rb/lib/thrift/transport/socket.rb
index 36461e9..2b7ca09 100644
--- a/lib/rb/lib/thrift/transport/socket.rb
+++ b/lib/rb/lib/thrift/transport/socket.rb

@@ -61,6 +61,7 @@
 
     def write(str)
       raise IOError, "closed stream" unless open?
+      str = Bytes.force_binary_encoding(str)
       begin
         if @timeout.nil? or @timeout == 0
           @handle.write(str)

diff --git a/lib/rb/spec/binary_protocol_spec_shared.rb b/lib/rb/spec/binary_protocol_spec_shared.rb
index ce4931f..c49ff1f 100644
--- a/lib/rb/spec/binary_protocol_spec_shared.rb
+++ b/lib/rb/spec/binary_protocol_spec_shared.rb

@@ -1,3 +1,4 @@
+# encoding: ascii-8bit
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
@@ -192,13 +193,41 @@
   it "should error gracefully when trying to write a nil double" do
     lambda { @prot.write_double(nil) }.should raise_error
   end
-  
-  it "should write a string" do
-    str = "hello world"
-    @prot.write_string(str)
-    @trans.read(@trans.available).should == [str.size].pack("N") + str
+
+  if RUBY_VERSION >= '1.9'
+    it 'should write a string' do
+      str = 'abc'
+      @prot.write_string(str)
+      a = @trans.read(@trans.available)
+      a.encoding.should == Encoding::BINARY
+      a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63]
+    end
+
+    it 'should write a string with unicode characters' do
+      str = "abc \u20AC \u20AD".encode('UTF-8')
+      @prot.write_string(str)
+      a = @trans.read(@trans.available)
+      a.encoding.should == Encoding::BINARY
+      a.unpack('C*').should == [0x00, 0x00, 0x00, 0x0B, 0x61, 0x62, 0x63, 0x20,
+                                0xE2, 0x82, 0xAC, 0x20, 0xE2, 0x82, 0xAD]
+    end
+
+    it 'should write should write a string with unicode characters and transcoding' do
+      str = "abc \u20AC".encode('ISO-8859-15')
+      @prot.write_string(str)
+      a = @trans.read(@trans.available)
+      a.encoding.should == Encoding::BINARY
+      a.unpack('C*').should == [0x00, 0x00, 0x00, 0x07, 0x61, 0x62, 0x63, 0x20, 0xE2, 0x82, 0xAC]
+    end
+  else
+    it 'should write a string' do
+      str = 'abc'
+      @prot.write_string(str)
+      a = @trans.read(@trans.available)
+      a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63]
+    end
   end
-  
+
   it "should error gracefully when trying to write a nil string" do
     lambda { @prot.write_string(nil) }.should raise_error
   end
@@ -294,11 +323,32 @@
       @prot.read_double.should == f
     end
   end
-  
-  it "should read a string" do
-    str = "hello world"
-    @trans.write([str.size].pack("N") + str)
-    @prot.read_string.should == str
+
+  if RUBY_VERSION >= '1.9'
+    it 'should read a string' do
+      # i32 of value 3, followed by three characters/UTF-8 bytes 'a', 'b', 'c'
+      buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*')
+      @trans.write(buffer)
+      a = @prot.read_string
+      a.should == 'abc'.encode('UTF-8')
+      a.encoding.should == Encoding::UTF_8
+    end
+
+    it 'should read a string containing unicode characters from UTF-8 encoded buffer' do
+      # i32 of value 3, followed by one character U+20AC made up of three bytes
+      buffer = [0x00, 0x00, 0x00, 0x03, 0xE2, 0x82, 0xAC].pack('C*')
+      @trans.write(buffer)
+      a = @prot.read_string
+      a.should == "\u20AC".encode('UTF-8')
+      a.encoding.should == Encoding::UTF_8
+    end
+  else
+    it 'should read a string' do
+      # i32 of value 3, followed by three characters/UTF-8 bytes 'a', 'b', 'c'
+      buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*')
+      @trans.write(buffer)
+      @prot.read_string.should == 'abc'
+    end
   end
 
   it "should perform a complete rpc with no args or return" do

diff --git a/lib/rb/spec/bytes_spec.rb b/lib/rb/spec/bytes_spec.rb
new file mode 100644
index 0000000..b82e304
--- /dev/null
+++ b/lib/rb/spec/bytes_spec.rb

@@ -0,0 +1,160 @@
+# encoding: UTF-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+require 'spec_helper'
+
+describe Thrift::Bytes do
+  if RUBY_VERSION >= '1.9'
+    describe '.empty_byte_buffer' do
+      it 'should create an empty buffer' do
+        b = Thrift::Bytes.empty_byte_buffer
+        b.length.should == 0
+        b.encoding.should == Encoding::BINARY
+      end
+
+      it 'should create an empty buffer of given size' do
+        b = Thrift::Bytes.empty_byte_buffer 2
+        b.length.should == 2
+        b.getbyte(0).should == 0
+        b.getbyte(1).should == 0
+        b.encoding.should == Encoding::BINARY
+      end
+    end
+
+    describe '.force_binary_encoding' do
+      it 'should change encoding' do
+        e = 'STRING'.encode('UTF-8')
+        e.encoding.should_not == Encoding::BINARY
+        a = Thrift::Bytes.force_binary_encoding e
+        a.encoding.should == Encoding::BINARY
+      end
+    end
+
+    describe '.get_string_byte' do
+      it 'should get the byte at index' do
+        s = "\x41\x42"
+        Thrift::Bytes.get_string_byte(s, 0).should == 0x41
+        Thrift::Bytes.get_string_byte(s, 1).should == 0x42
+      end
+    end
+
+    describe '.set_string_byte' do
+      it 'should set byte value at index' do
+        s = "\x41\x42"
+        Thrift::Bytes.set_string_byte(s, 0, 0x43)
+        s.getbyte(0).should == 0x43
+        s.should == 'CB'
+      end
+    end
+
+    describe '.convert_to_utf8_byte_buffer' do
+      it 'should convert UTF-8 String to byte buffer' do
+        e = "\u20AC".encode('UTF-8') # a string with euro sign character U+20AC
+        e.length.should == 1
+
+        a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+        a.encoding.should == Encoding::BINARY
+        a.length.should == 3
+        a.unpack('C*').should == [0xE2, 0x82, 0xAC]
+      end
+
+      it 'should convert ISO-8859-15 String to UTF-8 byte buffer' do
+        # Assumptions
+        e = "\u20AC".encode('ISO-8859-15') # a string with euro sign character U+20AC, then converted to ISO-8859-15
+        e.length.should == 1
+        e.unpack('C*').should == [0xA4] # in ISO-8859-15 the euro sign is the single byte 0xA4 (its Unicode code point is still U+20AC)
+
+        a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+        a.encoding.should == Encoding::BINARY
+        a.length.should == 3
+        a.unpack('C*').should == [0xE2, 0x82, 0xAC]
+      end
+    end
+
+    describe '.convert_to_string' do
+      it 'should convert UTF-8 byte buffer to a UTF-8 String' do
+        e = [0xE2, 0x82, 0xAC].pack("C*")
+        e.encoding.should == Encoding::BINARY
+        a = Thrift::Bytes.convert_to_string e
+        a.encoding.should == Encoding::UTF_8
+        a.should == "\u20AC"
+      end
+    end
+
+  else # RUBY_VERSION
+    describe '.empty_byte_buffer' do
+      it 'should create an empty buffer' do
+        b = Thrift::Bytes.empty_byte_buffer
+        b.length.should == 0
+      end
+
+      it 'should create an empty buffer of given size' do
+        b = Thrift::Bytes.empty_byte_buffer 2
+        b.length.should == 2
+        b[0].should == 0
+        b[1].should == 0
+      end
+    end
+
+    describe '.force_binary_encoding' do
+      it 'should be a no-op' do
+        e = 'STRING'
+        a = Thrift::Bytes.force_binary_encoding e
+        a.should == e
+        a.should be(e)
+      end
+    end
+
+    describe '.get_string_byte' do
+      it 'should get the byte at index' do
+        s = "\x41\x42"
+        Thrift::Bytes.get_string_byte(s, 0).should == 0x41
+        Thrift::Bytes.get_string_byte(s, 1).should == 0x42
+      end
+    end
+
+    describe '.set_string_byte' do
+      it 'should set byte value at index' do
+        s = "\x41\x42"
+        Thrift::Bytes.set_string_byte(s, 0, 0x43)
+        s[0].should == 0x43
+        s.should == 'CB'
+      end
+    end
+
+    describe '.convert_to_utf8_byte_buffer' do
+      it 'should be a no-op' do
+        e = 'STRING'
+        a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+        a.should == e
+        a.should be(e)
+      end
+    end
+
+    describe '.convert_to_string' do
+      it 'should be a no-op' do
+        e = 'STRING'
+        a = Thrift::Bytes.convert_to_string e
+        a.should == e
+        a.should be(e)
+      end
+    end
+  end
+end

diff --git a/lib/rb/spec/compact_protocol_spec.rb b/lib/rb/spec/compact_protocol_spec.rb
index 13c6b83..91dfe44 100644
--- a/lib/rb/spec/compact_protocol_spec.rb
+++ b/lib/rb/spec/compact_protocol_spec.rb

@@ -1,3 +1,4 @@
+# encoding: UTF-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
@@ -25,7 +26,7 @@
     :i16 => (0..14).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
     :i32 => (0..30).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
     :i64 => (0..62).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
-    :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "1" * 127, "1" * 3000],
+    :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "unicode characters: \u20AC \u20AD", "1" * 127, "1" * 3000],
     :binary => ["", "\001", "\001" * 5, "\001" * 14, "\001" * 15, "\001" * 127, "\001" * 3000],
     :double => [0.0, 1.0, -1.0, 1.1, -1.1, 10000000.1, 1.0/0.0, -1.0/0.0],
     :bool => [true, false]

diff --git a/lib/rb/spec/json_protocol_spec.rb b/lib/rb/spec/json_protocol_spec.rb
index 3945925..a294ac5 100644
--- a/lib/rb/spec/json_protocol_spec.rb
+++ b/lib/rb/spec/json_protocol_spec.rb

@@ -1,3 +1,4 @@
+# encoding: UTF-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
@@ -220,9 +221,25 @@
       @trans.read(@trans.available).should == "\"-Infinity\""
     end
 
-    it "should write string" do
-      @prot.write_string("this is a test string")
-      @trans.read(@trans.available).should == "\"this is a test string\""
+    if RUBY_VERSION >= '1.9'
+      it 'should write string' do
+        @prot.write_string('this is a test string')
+        a = @trans.read(@trans.available)
+        a.should == '"this is a test string"'.force_encoding(Encoding::BINARY)
+        a.encoding.should == Encoding::BINARY
+      end
+
+      it 'should write string with unicode characters' do
+        @prot.write_string("this is a test string with unicode characters: \u20AC \u20AD")
+        a = @trans.read(@trans.available)
+        a.should == "\"this is a test string with unicode characters: \u20AC \u20AD\"".force_encoding(Encoding::BINARY)
+        a.encoding.should == Encoding::BINARY
+      end
+    else
+      it 'should write string' do
+        @prot.write_string('this is a test string')
+        @trans.read(@trans.available).should == '"this is a test string"'
+      end
     end
 
     it "should write binary" do
@@ -461,9 +478,25 @@
       @prot.read_double.should == 12.23
     end
 
-    it "should read string" do
-      @trans.write("\"this is a test string\"")
-      @prot.read_string.should == "this is a test string"
+    if RUBY_VERSION >= '1.9'
+      it 'should read string' do
+        @trans.write('"this is a test string"'.force_encoding(Encoding::BINARY))
+        a = @prot.read_string
+        a.should == 'this is a test string'
+        a.encoding.should == Encoding::UTF_8
+      end
+
+      it 'should read string with unicode characters' do
+        @trans.write('"this is a test string with unicode characters: \u20AC \u20AD"'.force_encoding(Encoding::BINARY))
+        a = @prot.read_string
+        a.should == "this is a test string with unicode characters: \u20AC \u20AD"
+        a.encoding.should == Encoding::UTF_8
+      end
+    else
+      it 'should read string' do
+        @trans.write('"this is a test string"')
+        @prot.read_string.should == 'this is a test string'
+      end
     end
 
     it "should read binary" do
commit	b5a18a150ee73003ab760f85023e5fcb3625c8e5	[log] [tgz]
author	Jake Farrell <jfarrell@apache.org>	Tue Oct 09 01:10:43 2012 +0000
committer	Jake Farrell <jfarrell@apache.org>	Tue Oct 09 01:10:43 2012 +0000
tree	7a721c8263485511291e94d877aef54f5b5c71ea
parent	fc35612d1dba14b47dadfed1c354f4d20be63e03 [diff]