THRIFT-1023: Thrift encoding (UTF-8) issue with Ruby 1.9.2
Client: rb
Patch: Nathan Beyer
Fixes the encoding issue for UTF-8 strings in the Ruby client.
git-svn-id: https://svn.apache.org/repos/asf/thrift/trunk@1395832 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/rb/ext/binary_protocol_accelerated.c b/lib/rb/ext/binary_protocol_accelerated.c
index bd1c2da..a8ebe7f 100644
--- a/lib/rb/ext/binary_protocol_accelerated.c
+++ b/lib/rb/ext/binary_protocol_accelerated.c
@@ -22,7 +22,8 @@
#include <stdint.h>
#include <constants.h>
#include <struct.h>
-#include "macros.h"
+#include <macros.h>
+#include <bytes.h>
VALUE rb_thrift_binary_proto_native_qmark(VALUE self) {
return Qtrue;
@@ -80,6 +81,7 @@
if (TYPE(str) != T_STRING) {
rb_raise(rb_eStandardError, "Value should be a string");
}
+ str = convert_to_utf8_byte_buffer(str);
write_i32_direct(trans, RSTRING_LEN(str));
rb_funcall(trans, write_method_id, 1, str);
}
@@ -380,7 +382,8 @@
VALUE rb_thrift_binary_proto_read_string(VALUE self) {
int size = read_i32_direct(self);
- return READ(self, size);
+ VALUE buffer = READ(self, size);
+ return convert_to_string(buffer);
}
void Init_binary_protocol_accelerated() {
diff --git a/lib/rb/ext/bytes.c b/lib/rb/ext/bytes.c
new file mode 100644
index 0000000..8a6fac4
--- /dev/null
+++ b/lib/rb/ext/bytes.c
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <ruby.h>
+#ifdef HAVE_RUBY_ENCODING_H
+#include <ruby/encoding.h>
+#endif
+#include <constants.h>
+
+VALUE force_binary_encoding(VALUE buffer) {
+ return rb_funcall(thrift_bytes_module, force_binary_encoding_id, 1, buffer);
+}
+
+VALUE convert_to_utf8_byte_buffer(VALUE string) {
+ return rb_funcall(thrift_bytes_module, convert_to_utf8_byte_buffer_id, 1, string);
+}
+
+VALUE convert_to_string(VALUE utf8_buffer) {
+ return rb_funcall(thrift_bytes_module, convert_to_string_id, 1, utf8_buffer);
+}
diff --git a/lib/rb/ext/bytes.h b/lib/rb/ext/bytes.h
new file mode 100644
index 0000000..7108d83
--- /dev/null
+++ b/lib/rb/ext/bytes.h
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <ruby.h>
+
+/*
+ * A collection of utilities for working with bytes and byte buffers.
+ *
+ * These methods are the native analogues of some of the methods in
+ * Thrift::Bytes (thrift/bytes.rb).
+ */
+
+VALUE force_binary_encoding(VALUE buffer);
+VALUE convert_to_utf8_byte_buffer(VALUE string);
+VALUE convert_to_string(VALUE utf8_buffer);
diff --git a/lib/rb/ext/compact_protocol.c b/lib/rb/ext/compact_protocol.c
index a47fe6c..0c05481 100644
--- a/lib/rb/ext/compact_protocol.c
+++ b/lib/rb/ext/compact_protocol.c
@@ -20,9 +20,10 @@
#include <ruby.h>
#include <stdbool.h>
#include <stdint.h>
-#include "constants.h"
-#include "struct.h"
-#include "macros.h"
+#include <constants.h>
+#include <struct.h>
+#include <macros.h>
+#include <bytes.h>
#define LAST_ID(obj) FIX2INT(rb_ary_pop(rb_ivar_get(obj, last_field_id)))
#define SET_LAST_ID(obj, val) rb_ary_push(rb_ivar_get(obj, last_field_id), val)
@@ -305,6 +306,7 @@
VALUE rb_thrift_compact_proto_write_string(VALUE self, VALUE str) {
VALUE transport = GET_TRANSPORT(self);
+ str = convert_to_utf8_byte_buffer(str);
write_varint32(transport, RSTRING_LEN(str));
WRITE(transport, RSTRING_PTR(str), RSTRING_LEN(str));
return Qnil;
@@ -546,7 +548,8 @@
VALUE rb_thrift_compact_proto_read_string(VALUE self) {
int64_t size = read_varint64(self);
- return READ(self, size);
+ VALUE buffer = READ(self, size);
+ return convert_to_string(buffer);
}
static void Init_constants() {
diff --git a/lib/rb/ext/constants.h b/lib/rb/ext/constants.h
index 9ea00d2..3bfac88 100644
--- a/lib/rb/ext/constants.h
+++ b/lib/rb/ext/constants.h
@@ -76,6 +76,9 @@
extern ID read_all_method_id;
extern ID read_into_buffer_method_id;
extern ID native_qmark_method_id;
+extern ID force_binary_encoding_id;
+extern ID convert_to_utf8_byte_buffer_id;
+extern ID convert_to_string_id;
extern ID fields_const_id;
extern ID transport_ivar_id;
@@ -92,5 +95,6 @@
extern VALUE rb_cSet;
extern VALUE thrift_module;
extern VALUE thrift_types_module;
+extern VALUE thrift_bytes_module;
extern VALUE class_thrift_protocol;
extern VALUE protocol_exception_class;
diff --git a/lib/rb/ext/memory_buffer.c b/lib/rb/ext/memory_buffer.c
index 319b073..e7253dc 100644
--- a/lib/rb/ext/memory_buffer.c
+++ b/lib/rb/ext/memory_buffer.c
@@ -19,7 +19,8 @@
#include <ruby.h>
#include <constants.h>
-#include "macros.h"
+#include <bytes.h>
+#include <macros.h>
ID buf_ivar_id;
ID index_ivar_id;
@@ -37,6 +38,7 @@
VALUE rb_thrift_memory_buffer_write(VALUE self, VALUE str) {
VALUE buf = GET_BUF(self);
+ str = force_binary_encoding(str);
rb_str_buf_cat(buf, RSTRING_PTR(str), RSTRING_LEN(str));
return Qnil;
}
diff --git a/lib/rb/ext/thrift_native.c b/lib/rb/ext/thrift_native.c
index 2f6bb1a..f066d6c 100644
--- a/lib/rb/ext/thrift_native.c
+++ b/lib/rb/ext/thrift_native.c
@@ -18,6 +18,7 @@
*/
#include <ruby.h>
+#include <bytes.h>
#include <struct.h>
#include <binary_protocol_accelerated.h>
#include <compact_protocol.h>
@@ -27,6 +28,7 @@
// cached classes/modules
VALUE rb_cSet;
VALUE thrift_module;
+VALUE thrift_bytes_module;
VALUE thrift_types_module;
// TType constants
@@ -90,6 +92,9 @@
ID read_all_method_id;
ID read_into_buffer_method_id;
ID native_qmark_method_id;
+ID force_binary_encoding_id;
+ID convert_to_utf8_byte_buffer_id;
+ID convert_to_string_id;
// constant ids
ID fields_const_id;
@@ -109,6 +114,7 @@
void Init_thrift_native() {
// cached classes
thrift_module = rb_const_get(rb_cObject, rb_intern("Thrift"));
+ thrift_bytes_module = rb_const_get(thrift_module, rb_intern("Bytes"));
thrift_types_module = rb_const_get(thrift_module, rb_intern("Types"));
rb_cSet = rb_const_get(rb_cObject, rb_intern("Set"));
protocol_exception_class = rb_const_get(thrift_module, rb_intern("ProtocolException"));
@@ -173,6 +179,9 @@
read_all_method_id = rb_intern("read_all");
read_into_buffer_method_id = rb_intern("read_into_buffer");
native_qmark_method_id = rb_intern("native?");
+ force_binary_encoding_id = rb_intern("force_binary_encoding");
+ convert_to_utf8_byte_buffer_id = rb_intern("convert_to_utf8_byte_buffer");
+ convert_to_string_id = rb_intern("convert_to_string");
// constant ids
fields_const_id = rb_intern("FIELDS");
diff --git a/lib/rb/lib/thrift.rb b/lib/rb/lib/thrift.rb
index 72050b1..fb9e04a 100644
--- a/lib/rb/lib/thrift.rb
+++ b/lib/rb/lib/thrift.rb
@@ -22,6 +22,7 @@
$:.unshift File.dirname(__FILE__)
+require 'thrift/bytes'
require 'thrift/core_ext'
require 'thrift/exceptions'
require 'thrift/types'
diff --git a/lib/rb/lib/thrift/bytes.rb b/lib/rb/lib/thrift/bytes.rb
new file mode 100644
index 0000000..efd4f64
--- /dev/null
+++ b/lib/rb/lib/thrift/bytes.rb
@@ -0,0 +1,131 @@
+# encoding: ascii-8bit
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+module Thrift
+ # A collection of utilities for working with bytes and byte buffers.
+ module Bytes
+ if RUBY_VERSION >= '1.9'
+ # Creates an empty byte buffer (String with BINARY encoding)
+ #
+ # size - The Integer size of the buffer (default: nil) to create
+ #
+ # Returns a String with BINARY encoding, filled with null characters
+ # if size is greater than zero
+ def self.empty_byte_buffer(size = nil)
+ if (size && size > 0)
+ "\0".force_encoding(Encoding::BINARY) * size
+ else
+ ''.force_encoding(Encoding::BINARY)
+ end
+ end
+
+ # Forces the encoding of the buffer to BINARY. If the buffer
+ # passed is frozen, then it will be duplicated.
+ #
+ # buffer - The String to force the encoding of.
+ #
+ # Returns the String passed with an encoding of BINARY; returned
+ # String may be a duplicate.
+ def self.force_binary_encoding(buffer)
+ buffer = buffer.dup if buffer.frozen?
+ buffer.force_encoding(Encoding::BINARY)
+ end
+
+ # Gets the byte value of a given position in a String.
+ #
+ # string - The String to retrieve the byte value from.
+ # index - The Integer location of the byte value to retrieve.
+ #
+ # Returns an Integer value between 0 and 255.
+ def self.get_string_byte(string, index)
+ string.getbyte(index)
+ end
+
+ # Sets the byte value given to a given index in a String.
+ #
+ # string - The String to set the byte value in.
+ # index - The Integer location to set the byte value at.
+ # byte - The Integer value (0 to 255) to set in the string.
+ #
+ # Returns an Integer value of the byte value to set.
+ def self.set_string_byte(string, index, byte)
+ string.setbyte(index, byte)
+ end
+
+ # Converts the given String to a UTF-8 byte buffer.
+ #
+ # string - The String to convert.
+ #
+ # Returns a new String with BINARY encoding, containing the UTF-8
+ # bytes of the original string.
+ def self.convert_to_utf8_byte_buffer(string)
+ if string.encoding != Encoding::UTF_8
+ # transcode to UTF-8
+ string = string.encode(Encoding::UTF_8)
+ else
+ # encoding is already UTF-8, but a duplicate is needed
+ string = string.dup
+ end
+ string.force_encoding(Encoding::BINARY)
+ end
+
+ # Converts the given UTF-8 byte buffer into a String
+ #
+ # utf8_buffer - A String, with BINARY encoding, containing UTF-8 bytes
+ #
+ # Returns a new String with UTF-8 encoding.
+ def self.convert_to_string(utf8_buffer)
+ # duplicate the buffer, force encoding to UTF-8
+ utf8_buffer.dup.force_encoding(Encoding::UTF_8)
+ end
+ else
+ def self.empty_byte_buffer(size = nil)
+ if (size && size > 0)
+ "\0" * size
+ else
+ ''
+ end
+ end
+
+ def self.force_binary_encoding(buffer)
+ buffer
+ end
+
+ def self.get_string_byte(string, index)
+ string[index]
+ end
+
+ def self.set_string_byte(string, index, byte)
+ string[index] = byte
+ end
+
+ def self.convert_to_utf8_byte_buffer(string)
+ # This assumes $KCODE is 'UTF8'/'U', which would mean the String is already a UTF-8 byte buffer
+ # TODO consider handling other $KCODE values and transcoding with iconv
+ string
+ end
+
+ def self.convert_to_string(utf8_buffer)
+ # See comment in 'convert_to_utf8_byte_buffer' for relevant assumptions.
+ utf8_buffer
+ end
+ end
+ end
+end
diff --git a/lib/rb/lib/thrift/protocol/base_protocol.rb b/lib/rb/lib/thrift/protocol/base_protocol.rb
index b19909d..a5a174d 100644
--- a/lib/rb/lib/thrift/protocol/base_protocol.rb
+++ b/lib/rb/lib/thrift/protocol/base_protocol.rb
@@ -114,6 +114,13 @@
raise NotImplementedError
end
+ # Writes a Thrift String. In Ruby 1.9+, the String passed will be transcoded to UTF-8.
+ #
+ # str - The String to write.
+ #
+ # Raises EncodingError if the transcoding to UTF-8 fails.
+ #
+ # Returns nothing.
def write_string(str)
raise NotImplementedError
end
@@ -178,6 +185,9 @@
raise NotImplementedError
end
+ # Reads a Thrift String. In Ruby 1.9+, all Strings will be returned with an Encoding of UTF-8.
+ #
+ # Returns a String.
def read_string
raise NotImplementedError
end
diff --git a/lib/rb/lib/thrift/protocol/binary_protocol.rb b/lib/rb/lib/thrift/protocol/binary_protocol.rb
index f9adb20..2528276 100644
--- a/lib/rb/lib/thrift/protocol/binary_protocol.rb
+++ b/lib/rb/lib/thrift/protocol/binary_protocol.rb
@@ -32,8 +32,7 @@
# Pre-allocated read buffer for fixed-size read methods. Needs to be at least 8 bytes long for
# read_i64() and read_double().
- @rbuf = "\0" * 8
- @rbuf.force_encoding("BINARY") if @rbuf.respond_to?(:force_encoding)
+ @rbuf = Bytes.empty_byte_buffer(8)
end
def write_message_begin(name, type, seqid)
@@ -108,6 +107,7 @@
end
def write_string(str)
+ str = Bytes.convert_to_utf8_byte_buffer(str)
write_i32(str.length)
trans.write(str)
end
@@ -214,9 +214,9 @@
end
def read_string
- sz = read_i32
- dat = trans.read_all(sz)
- dat
+ size = read_i32
+ buffer = trans.read_all(size)
+ Bytes.convert_to_string(buffer)
end
end
diff --git a/lib/rb/lib/thrift/protocol/compact_protocol.rb b/lib/rb/lib/thrift/protocol/compact_protocol.rb
index ede82f2..758e1ae 100644
--- a/lib/rb/lib/thrift/protocol/compact_protocol.rb
+++ b/lib/rb/lib/thrift/protocol/compact_protocol.rb
@@ -100,8 +100,7 @@
@boolean_value = nil
# Pre-allocated read buffer for read_double().
- @rbuf = "\0" * 8
- @rbuf.force_encoding("BINARY") if @rbuf.respond_to?(:force_encoding)
+ @rbuf = Bytes.empty_byte_buffer(8)
end
def write_message_begin(name, type, seqid)
@@ -211,6 +210,7 @@
end
def write_string(str)
+ str = Bytes.convert_to_utf8_byte_buffer(str)
write_varint32(str.length)
@trans.write(str)
end
@@ -333,7 +333,8 @@
def read_string
size = read_varint32()
- trans.read_all(size)
+ buffer = trans.read_all(size)
+ Bytes.convert_to_string(buffer)
end
diff --git a/lib/rb/lib/thrift/protocol/json_protocol.rb b/lib/rb/lib/thrift/protocol/json_protocol.rb
index ddbf193..6f8d1d1 100644
--- a/lib/rb/lib/thrift/protocol/json_protocol.rb
+++ b/lib/rb/lib/thrift/protocol/json_protocol.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -482,13 +483,21 @@
end
# Decodes the four hex parts of a JSON escaped string character and returns
- # the character via out. The first two characters must be "00".
+ # the character via out.
+ #
+ # Note - this only supports Unicode characters in the BMP (U+0000 to U+FFFF);
+ # characters above the BMP are encoded as two escape sequences (surrogate pairs),
+ # which is not yet implemented
def read_json_escape_char
- read_json_syntax_char('0')
- read_json_syntax_char('0')
str = @reader.read
str += @reader.read
- str.hex.chr
+ str += @reader.read
+ str += @reader.read
+ if RUBY_VERSION >= '1.9'
+ str.hex.chr(Encoding::UTF_8)
+ else
+ str.hex.chr
+ end
end
# Decodes a JSON string, including unescaping, and returns the string via str
diff --git a/lib/rb/lib/thrift/transport/base_transport.rb b/lib/rb/lib/thrift/transport/base_transport.rb
index 0a12cea..8790326 100644
--- a/lib/rb/lib/thrift/transport/base_transport.rb
+++ b/lib/rb/lib/thrift/transport/base_transport.rb
@@ -35,22 +35,14 @@
end
module TransportUtils
- if RUBY_VERSION >= '1.9'
- def self.get_string_byte(string, index)
- string.getbyte(index)
- end
+ # Deprecated: Use Thrift::Bytes instead
+ def self.get_string_byte(string, index)
+ Bytes.get_string_byte(string, index)
+ end
- def self.set_string_byte(string, index, byte)
- string.setbyte(index, byte)
- end
- else
- def self.get_string_byte(string, index)
- string[index]
- end
-
- def self.set_string_byte(string, index, byte)
- string[index] = byte
- end
+ # Deprecated: Use Thrift::Bytes instead
+ def self.set_string_byte(string, index, byte)
+ Bytes.set_string_byte(string, index, byte)
end
end
@@ -61,6 +53,11 @@
def close; end
+ # Reads a number of bytes from the transport. In Ruby 1.9+, the String returned will have a BINARY (aka ASCII8BIT) encoding.
+ #
+ # sz - The number of bytes to read from the transport.
+ #
+ # Returns a String acting as a byte buffer.
def read(sz)
raise NotImplementedError
end
@@ -68,7 +65,7 @@
# Returns an unsigned byte as a Fixnum in the range (0..255).
def read_byte
buf = read_all(1)
- return ::Thrift::TransportUtils.get_string_byte(buf, 0)
+ return Bytes.get_string_byte(buf, 0)
end
# Reads size bytes and copies them into buffer[0..size].
@@ -76,14 +73,14 @@
tmp = read_all(size)
i = 0
tmp.each_byte do |byte|
- ::Thrift::TransportUtils.set_string_byte(buffer, i, byte)
+ Bytes.set_string_byte(buffer, i, byte)
i += 1
end
i
end
def read_all(size)
- return '' if size <= 0
+ return Bytes.empty_byte_buffer if size <= 0
buf = read(size)
while (buf.length < size)
chunk = read(size - buf.length)
@@ -92,7 +89,12 @@
buf
end
-
+
+ # Writes the byte buffer to the transport. In Ruby 1.9+, the buffer will be forced into BINARY encoding.
+ #
+ # buf - A String acting as a byte buffer.
+ #
+ # Returns nothing.
def write(buf); end
alias_method :<<, :write
@@ -104,4 +106,4 @@
return trans
end
end
-end
\ No newline at end of file
+end
diff --git a/lib/rb/lib/thrift/transport/buffered_transport.rb b/lib/rb/lib/thrift/transport/buffered_transport.rb
index 676a4d3..781d3c6 100644
--- a/lib/rb/lib/thrift/transport/buffered_transport.rb
+++ b/lib/rb/lib/thrift/transport/buffered_transport.rb
@@ -24,8 +24,8 @@
def initialize(transport)
@transport = transport
- @wbuf = ''
- @rbuf = ''
+ @wbuf = Bytes.empty_byte_buffer
+ @rbuf = Bytes.empty_byte_buffer
@index = 0
end
@@ -44,12 +44,12 @@
def read(sz)
@index += sz
- ret = @rbuf.slice(@index - sz, sz) || ''
+ ret = @rbuf.slice(@index - sz, sz) || Bytes.empty_byte_buffer
if ret.length == 0
@rbuf = @transport.read([sz, DEFAULT_BUFFER].max)
@index = sz
- ret = @rbuf.slice(0, sz) || ''
+ ret = @rbuf.slice(0, sz) || Bytes.empty_byte_buffer
end
ret
@@ -65,9 +65,15 @@
# The read buffer has some data now, read a single byte. Using get_string_byte() avoids
# allocating a temp string of size 1 unnecessarily.
@index += 1
- return ::Thrift::TransportUtils.get_string_byte(@rbuf, @index - 1)
+ return Bytes.get_string_byte(@rbuf, @index - 1)
end
+ # Reads a number of bytes from the transport into the buffer passed.
+ #
+ # buffer - The String (byte buffer) to write data to; this is assumed to have a BINARY encoding.
+ # size - The number of bytes to read from the transport and write to the buffer.
+ #
+ # Returns the number of bytes read.
def read_into_buffer(buffer, size)
i = 0
while i < size
@@ -78,8 +84,8 @@
end
# The read buffer has some data now, so copy bytes over to the output buffer.
- byte = ::Thrift::TransportUtils.get_string_byte(@rbuf, @index)
- ::Thrift::TransportUtils.set_string_byte(buffer, i, byte)
+ byte = Bytes.get_string_byte(@rbuf, @index)
+ Bytes.set_string_byte(buffer, i, byte)
@index += 1
i += 1
end
@@ -87,13 +93,13 @@
end
def write(buf)
- @wbuf << buf
+ @wbuf << Bytes.force_binary_encoding(buf)
end
def flush
- if @wbuf != ''
+ unless @wbuf.empty?
@transport.write(@wbuf)
- @wbuf = ''
+ @wbuf = Bytes.empty_byte_buffer
end
@transport.flush
diff --git a/lib/rb/lib/thrift/transport/framed_transport.rb b/lib/rb/lib/thrift/transport/framed_transport.rb
index e7630d0..d806ce0 100644
--- a/lib/rb/lib/thrift/transport/framed_transport.rb
+++ b/lib/rb/lib/thrift/transport/framed_transport.rb
@@ -22,8 +22,8 @@
class FramedTransport < BaseTransport
def initialize(transport, read=true, write=true)
@transport = transport
- @rbuf = ''
- @wbuf = ''
+ @rbuf = Bytes.empty_byte_buffer
+ @wbuf = Bytes.empty_byte_buffer
@read = read
@write = write
@index = 0
@@ -44,12 +44,12 @@
def read(sz)
return @transport.read(sz) unless @read
- return '' if sz <= 0
+ return Bytes.empty_byte_buffer if sz <= 0
read_frame if @index >= @rbuf.length
@index += sz
- @rbuf.slice(@index - sz, sz) || ''
+ @rbuf.slice(@index - sz, sz) || Bytes.empty_byte_buffer
end
def read_byte
@@ -60,7 +60,7 @@
# The read buffer has some data now, read a single byte. Using get_string_byte() avoids
# allocating a temp string of size 1 unnecessarily.
@index += 1
- return ::Thrift::TransportUtils.get_string_byte(@rbuf, @index - 1)
+ return Bytes.get_string_byte(@rbuf, @index - 1)
end
def read_into_buffer(buffer, size)
@@ -69,18 +69,18 @@
read_frame if @index >= @rbuf.length
# The read buffer has some data now, so copy bytes over to the output buffer.
- byte = ::Thrift::TransportUtils.get_string_byte(@rbuf, @index)
- ::Thrift::TransportUtils.set_string_byte(buffer, i, byte)
+ byte = Bytes.get_string_byte(@rbuf, @index)
+ Bytes.set_string_byte(buffer, i, byte)
@index += 1
i += 1
end
i
end
-
- def write(buf,sz=nil)
+ def write(buf, sz=nil)
return @transport.write(buf) unless @write
+ buf = Bytes.force_binary_encoding(buf)
@wbuf << (sz ? buf[0...sz] : buf)
end
@@ -92,10 +92,11 @@
return @transport.flush unless @write
out = [@wbuf.length].pack('N')
+ # Array#pack should return a BINARY encoded String, so it shouldn't be necessary to force encoding
out << @wbuf
@transport.write(out)
@transport.flush
- @wbuf = ''
+ @wbuf = Bytes.empty_byte_buffer
end
private
diff --git a/lib/rb/lib/thrift/transport/http_client_transport.rb b/lib/rb/lib/thrift/transport/http_client_transport.rb
index 1ef0fab..07f74bc 100644
--- a/lib/rb/lib/thrift/transport/http_client_transport.rb
+++ b/lib/rb/lib/thrift/transport/http_client_transport.rb
@@ -29,12 +29,12 @@
def initialize(url)
@url = URI url
@headers = {'Content-Type' => 'application/x-thrift'}
- @outbuf = ""
+ @outbuf = Bytes.empty_byte_buffer
end
def open?; true end
def read(sz); @inbuf.read sz end
- def write(buf); @outbuf << buf end
+ def write(buf); @outbuf << Bytes.force_binary_encoding(buf) end
def add_headers(headers)
@headers = @headers.merge(headers)
@@ -42,11 +42,12 @@
def flush
http = Net::HTTP.new @url.host, @url.port
- http.use_ssl = @url.scheme == "https"
+ http.use_ssl = @url.scheme == 'https'
resp = http.post(@url.request_uri, @outbuf, @headers)
data = resp.body
+ data = Bytes.force_binary_encoding(data)
@inbuf = StringIO.new data
- @outbuf = ""
+ @outbuf = Bytes.empty_byte_buffer
end
end
end
diff --git a/lib/rb/lib/thrift/transport/io_stream_transport.rb b/lib/rb/lib/thrift/transport/io_stream_transport.rb
index be348aa..e3c8379 100644
--- a/lib/rb/lib/thrift/transport/io_stream_transport.rb
+++ b/lib/rb/lib/thrift/transport/io_stream_transport.rb
@@ -32,7 +32,7 @@
def open?; not @input.closed? or not @output.closed? end
def read(sz); @input.read(sz) end
- def write(buf); @output.write(buf) end
+ def write(buf); @output.write(Bytes.force_binary_encoding(buf)) end
def close; @input.close; @output.close end
def to_io; @input end # we're assuming this is used in a IO.select for reading
end
diff --git a/lib/rb/lib/thrift/transport/memory_buffer_transport.rb b/lib/rb/lib/thrift/transport/memory_buffer_transport.rb
index 62c5292..ad5ad85 100644
--- a/lib/rb/lib/thrift/transport/memory_buffer_transport.rb
+++ b/lib/rb/lib/thrift/transport/memory_buffer_transport.rb
@@ -28,7 +28,7 @@
# this behavior is no longer required. If you wish to change it
# go ahead, just make sure the specs pass
def initialize(buffer = nil)
- @buf = buffer || ''
+ @buf = buffer ? Bytes.force_binary_encoding(buffer) : Bytes.empty_byte_buffer
@index = 0
end
@@ -48,7 +48,7 @@
# this method does not use the passed object directly but copies it
def reset_buffer(new_buf = '')
- @buf.replace new_buf
+ @buf.replace Bytes.force_binary_encoding(new_buf)
@index = 0
end
@@ -72,7 +72,7 @@
def read_byte
raise EOFError.new("Not enough bytes remain in buffer") if @index >= @buf.size
- val = ::Thrift::TransportUtils.get_string_byte(@buf, @index)
+ val = Bytes.get_string_byte(@buf, @index)
@index += 1
if @index >= GARBAGE_BUFFER_SIZE
@buf = @buf.slice(@index..-1)
@@ -87,8 +87,8 @@
raise EOFError.new("Not enough bytes remain in buffer") if @index >= @buf.size
# The read buffer has some data now, so copy bytes over to the output buffer.
- byte = ::Thrift::TransportUtils.get_string_byte(@buf, @index)
- ::Thrift::TransportUtils.set_string_byte(buffer, i, byte)
+ byte = Bytes.get_string_byte(@buf, @index)
+ Bytes.set_string_byte(buffer, i, byte)
@index += 1
i += 1
end
@@ -100,7 +100,7 @@
end
def write(wbuf)
- @buf << wbuf
+ @buf << Bytes.force_binary_encoding(wbuf)
end
def flush
diff --git a/lib/rb/lib/thrift/transport/socket.rb b/lib/rb/lib/thrift/transport/socket.rb
index 36461e9..2b7ca09 100644
--- a/lib/rb/lib/thrift/transport/socket.rb
+++ b/lib/rb/lib/thrift/transport/socket.rb
@@ -61,6 +61,7 @@
def write(str)
raise IOError, "closed stream" unless open?
+ str = Bytes.force_binary_encoding(str)
begin
if @timeout.nil? or @timeout == 0
@handle.write(str)
diff --git a/lib/rb/spec/binary_protocol_spec_shared.rb b/lib/rb/spec/binary_protocol_spec_shared.rb
index ce4931f..c49ff1f 100644
--- a/lib/rb/spec/binary_protocol_spec_shared.rb
+++ b/lib/rb/spec/binary_protocol_spec_shared.rb
@@ -1,3 +1,4 @@
+# encoding: ascii-8bit
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -192,13 +193,41 @@
it "should error gracefully when trying to write a nil double" do
lambda { @prot.write_double(nil) }.should raise_error
end
-
- it "should write a string" do
- str = "hello world"
- @prot.write_string(str)
- @trans.read(@trans.available).should == [str.size].pack("N") + str
+
+ if RUBY_VERSION >= '1.9'
+ it 'should write a string' do
+ str = 'abc'
+ @prot.write_string(str)
+ a = @trans.read(@trans.available)
+ a.encoding.should == Encoding::BINARY
+ a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63]
+ end
+
+ it 'should write a string with unicode characters' do
+ str = "abc \u20AC \u20AD".encode('UTF-8')
+ @prot.write_string(str)
+ a = @trans.read(@trans.available)
+ a.encoding.should == Encoding::BINARY
+ a.unpack('C*').should == [0x00, 0x00, 0x00, 0x0B, 0x61, 0x62, 0x63, 0x20,
+ 0xE2, 0x82, 0xAC, 0x20, 0xE2, 0x82, 0xAD]
+ end
+
+ it 'should write should write a string with unicode characters and transcoding' do
+ str = "abc \u20AC".encode('ISO-8859-15')
+ @prot.write_string(str)
+ a = @trans.read(@trans.available)
+ a.encoding.should == Encoding::BINARY
+ a.unpack('C*').should == [0x00, 0x00, 0x00, 0x07, 0x61, 0x62, 0x63, 0x20, 0xE2, 0x82, 0xAC]
+ end
+ else
+ it 'should write a string' do
+ str = 'abc'
+ @prot.write_string(str)
+ a = @trans.read(@trans.available)
+ a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63]
+ end
end
-
+
it "should error gracefully when trying to write a nil string" do
lambda { @prot.write_string(nil) }.should raise_error
end
@@ -294,11 +323,32 @@
@prot.read_double.should == f
end
end
-
- it "should read a string" do
- str = "hello world"
- @trans.write([str.size].pack("N") + str)
- @prot.read_string.should == str
+
+ if RUBY_VERSION >= '1.9'
+ it 'should read a string' do
+ # i32 of value 3, followed by three characters/UTF-8 bytes 'a', 'b', 'c'
+ buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*')
+ @trans.write(buffer)
+ a = @prot.read_string
+ a.should == 'abc'.encode('UTF-8')
+ a.encoding.should == Encoding::UTF_8
+ end
+
+ it 'should read a string containing unicode characters from UTF-8 encoded buffer' do
+ # i32 of value 3, followed by one character U+20AC made up of three bytes
+ buffer = [0x00, 0x00, 0x00, 0x03, 0xE2, 0x82, 0xAC].pack('C*')
+ @trans.write(buffer)
+ a = @prot.read_string
+ a.should == "\u20AC".encode('UTF-8')
+ a.encoding.should == Encoding::UTF_8
+ end
+ else
+ it 'should read a string' do
+ # i32 of value 3, followed by three characters/UTF-8 bytes 'a', 'b', 'c'
+ buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*')
+ @trans.write(buffer)
+ @prot.read_string.should == 'abc'
+ end
end
it "should perform a complete rpc with no args or return" do
diff --git a/lib/rb/spec/bytes_spec.rb b/lib/rb/spec/bytes_spec.rb
new file mode 100644
index 0000000..b82e304
--- /dev/null
+++ b/lib/rb/spec/bytes_spec.rb
@@ -0,0 +1,160 @@
+# encoding: UTF-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+require 'spec_helper'
+
+describe Thrift::Bytes do
+ if RUBY_VERSION >= '1.9'
+ describe '.empty_byte_buffer' do
+ it 'should create an empty buffer' do
+ b = Thrift::Bytes.empty_byte_buffer
+ b.length.should == 0
+ b.encoding.should == Encoding::BINARY
+ end
+
+ it 'should create an empty buffer of given size' do
+ b = Thrift::Bytes.empty_byte_buffer 2
+ b.length.should == 2
+ b.getbyte(0).should == 0
+ b.getbyte(1).should == 0
+ b.encoding.should == Encoding::BINARY
+ end
+ end
+
+ describe '.force_binary_encoding' do
+ it 'should change encoding' do
+ e = 'STRING'.encode('UTF-8')
+ e.encoding.should_not == Encoding::BINARY
+ a = Thrift::Bytes.force_binary_encoding e
+ a.encoding.should == Encoding::BINARY
+ end
+ end
+
+ describe '.get_string_byte' do
+ it 'should get the byte at index' do
+ s = "\x41\x42"
+ Thrift::Bytes.get_string_byte(s, 0).should == 0x41
+ Thrift::Bytes.get_string_byte(s, 1).should == 0x42
+ end
+ end
+
+ describe '.set_string_byte' do
+ it 'should set byte value at index' do
+ s = "\x41\x42"
+ Thrift::Bytes.set_string_byte(s, 0, 0x43)
+ s.getbyte(0).should == 0x43
+ s.should == 'CB'
+ end
+ end
+
+ describe '.convert_to_utf8_byte_buffer' do
+ it 'should convert UTF-8 String to byte buffer' do
+ e = "\u20AC".encode('UTF-8') # a string with euro sign character U+20AC
+ e.length.should == 1
+
+ a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+ a.encoding.should == Encoding::BINARY
+ a.length.should == 3
+ a.unpack('C*').should == [0xE2, 0x82, 0xAC]
+ end
+
+ it 'should convert ISO-8859-15 String to UTF-8 byte buffer' do
+ # Assumptions
+ e = "\u20AC".encode('ISO-8859-15') # a string with euro sign character U+20AC, then converted to ISO-8859-15
+ e.length.should == 1
+ e.unpack('C*').should == [0xA4] # euro sign is a different code point in ISO-8859-15
+
+ a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+ a.encoding.should == Encoding::BINARY
+ a.length.should == 3
+ a.unpack('C*').should == [0xE2, 0x82, 0xAC]
+ end
+ end
+
+ describe '.convert_to_string' do
+ it 'should convert UTF-8 byte buffer to a UTF-8 String' do
+ e = [0xE2, 0x82, 0xAC].pack("C*")
+ e.encoding.should == Encoding::BINARY
+ a = Thrift::Bytes.convert_to_string e
+ a.encoding.should == Encoding::UTF_8
+ a.should == "\u20AC"
+ end
+ end
+
+ else # RUBY_VERSION
+ describe '.empty_byte_buffer' do
+ it 'should create an empty buffer' do
+ b = Thrift::Bytes.empty_byte_buffer
+ b.length.should == 0
+ end
+
+ it 'should create an empty buffer of given size' do
+ b = Thrift::Bytes.empty_byte_buffer 2
+ b.length.should == 2
+ b[0].should == 0
+ b[1].should == 0
+ end
+ end
+
+ describe '.force_binary_encoding' do
+ it 'should be a no-op' do
+ e = 'STRING'
+ a = Thrift::Bytes.force_binary_encoding e
+ a.should == e
+ a.should be(e)
+ end
+ end
+
+ describe '.get_string_byte' do
+ it 'should get the byte at index' do
+ s = "\x41\x42"
+ Thrift::Bytes.get_string_byte(s, 0).should == 0x41
+ Thrift::Bytes.get_string_byte(s, 1).should == 0x42
+ end
+ end
+
+ describe '.set_string_byte' do
+ it 'should set byte value at index' do
+ s = "\x41\x42"
+ Thrift::Bytes.set_string_byte(s, 0, 0x43)
+ s[0].should == 0x43
+ s.should == 'CB'
+ end
+ end
+
+ describe '.convert_to_utf8_byte_buffer' do
+ it 'should be a no-op' do
+ e = 'STRING'
+ a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+ a.should == e
+ a.should be(e)
+ end
+ end
+
+ describe '.convert_to_string' do
+ it 'should be a no-op' do
+ e = 'STRING'
+ a = Thrift::Bytes.convert_to_string e
+ a.should == e
+ a.should be(e)
+ end
+ end
+ end
+end
diff --git a/lib/rb/spec/compact_protocol_spec.rb b/lib/rb/spec/compact_protocol_spec.rb
index 13c6b83..91dfe44 100644
--- a/lib/rb/spec/compact_protocol_spec.rb
+++ b/lib/rb/spec/compact_protocol_spec.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -25,7 +26,7 @@
:i16 => (0..14).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
:i32 => (0..30).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
:i64 => (0..62).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
- :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "1" * 127, "1" * 3000],
+ :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "unicode characters: \u20AC \u20AD", "1" * 127, "1" * 3000],
:binary => ["", "\001", "\001" * 5, "\001" * 14, "\001" * 15, "\001" * 127, "\001" * 3000],
:double => [0.0, 1.0, -1.0, 1.1, -1.1, 10000000.1, 1.0/0.0, -1.0/0.0],
:bool => [true, false]
diff --git a/lib/rb/spec/json_protocol_spec.rb b/lib/rb/spec/json_protocol_spec.rb
index 3945925..a294ac5 100644
--- a/lib/rb/spec/json_protocol_spec.rb
+++ b/lib/rb/spec/json_protocol_spec.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -220,9 +221,25 @@
@trans.read(@trans.available).should == "\"-Infinity\""
end
- it "should write string" do
- @prot.write_string("this is a test string")
- @trans.read(@trans.available).should == "\"this is a test string\""
+ if RUBY_VERSION >= '1.9' # these examples also assert on String#encoding
+ it 'should write string' do
+ @prot.write_string('this is a test string')
+ a = @trans.read(@trans.available)
+ a.should == '"this is a test string"'.force_encoding(Encoding::BINARY) # JSON-quoted text, compared as a BINARY string
+ a.encoding.should == Encoding::BINARY
+ end
+
+ it 'should write string with unicode characters' do
+ @prot.write_string("this is a test string with unicode characters: \u20AC \u20AD")
+ a = @trans.read(@trans.available)
+ a.should == "\"this is a test string with unicode characters: \u20AC \u20AD\"".force_encoding(Encoding::BINARY) # expected output holds the characters themselves (as bytes), not \uXXXX escapes
+ a.encoding.should == Encoding::BINARY
+ end
+ else # fallback example: content check only, no encoding assertions
+ it 'should write string' do
+ @prot.write_string('this is a test string')
+ @trans.read(@trans.available).should == '"this is a test string"'
+ end
end
it "should write binary" do
@@ -461,9 +478,25 @@
@prot.read_double.should == 12.23
end
- it "should read string" do
- @trans.write("\"this is a test string\"")
- @prot.read_string.should == "this is a test string"
+ if RUBY_VERSION >= '1.9' # these examples also assert on String#encoding
+ it 'should read string' do
+ @trans.write('"this is a test string"'.force_encoding(Encoding::BINARY)) # input is fed to the transport as a BINARY string
+ a = @prot.read_string
+ a.should == 'this is a test string'
+ a.encoding.should == Encoding::UTF_8
+ end
+
+ it 'should read string with unicode characters' do
+ @trans.write('"this is a test string with unicode characters: \u20AC \u20AD"'.force_encoding(Encoding::BINARY)) # single-quoted literal: \u20AC and \u20AD are written as literal backslash escape sequences; NOTE(review): raw UTF-8 bytes on the wire are not exercised here - confirm that is intended
+ a = @prot.read_string
+ a.should == "this is a test string with unicode characters: \u20AC \u20AD" # double-quoted literal: the actual characters
+ a.encoding.should == Encoding::UTF_8
+ end
+ else # fallback example: content check only, no encoding assertions
+ it 'should read string' do
+ @trans.write('"this is a test string"')
+ @prot.read_string.should == 'this is a test string'
+ end
end
it "should read binary" do