Replace the "utf8" alias with the canonical "utf-8" encoding name + remove the second Python 2 vs Python 3 compat.py
Client: py
Patch: Alexandre Detiste

This closes #3105
diff --git a/lib/py/src/compat.py b/lib/py/src/compat.py
deleted file mode 100644
index 3b3d57f..0000000
--- a/lib/py/src/compat.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-from io import BytesIO as BufferIO  # noqa
-
-def binary_to_str(bin_val):
-    return bin_val.decode('utf8')
-
-def str_to_binary(str_val):
-    return bytes(str_val, 'utf8')
-
-def byte_index(bytes_val, i):
-    return bytes_val[i]
diff --git a/lib/py/src/protocol/TBinaryProtocol.py b/lib/py/src/protocol/TBinaryProtocol.py
index e59e0dc..af64ec1 100644
--- a/lib/py/src/protocol/TBinaryProtocol.py
+++ b/lib/py/src/protocol/TBinaryProtocol.py
@@ -17,10 +17,10 @@
 # under the License.
 #
 
-from .TProtocol import TType, TProtocolBase, TProtocolException, TProtocolFactory
-from ..compat import binary_to_str
 from struct import pack, unpack
 
+from .TProtocol import TType, TProtocolBase, TProtocolException, TProtocolFactory
+
 
 class TBinaryProtocol(TProtocolBase):
     """Binary implementation of the Thrift protocol driver."""
@@ -146,7 +146,7 @@
             if self.strictRead:
                 raise TProtocolException(type=TProtocolException.BAD_VERSION,
                                          message='No protocol version header')
-            name = binary_to_str(self.trans.readAll(sz))
+            name = self.trans.readAll(sz).decode('utf-8')
             type = self.readByte()
             seqid = self.readI32()
         return (name, type, seqid)
diff --git a/lib/py/src/protocol/TCompactProtocol.py b/lib/py/src/protocol/TCompactProtocol.py
index 700e792..a3527cd 100644
--- a/lib/py/src/protocol/TCompactProtocol.py
+++ b/lib/py/src/protocol/TCompactProtocol.py
@@ -20,8 +20,6 @@
 from .TProtocol import TType, TProtocolBase, TProtocolException, TProtocolFactory, checkIntegerLimits
 from struct import pack, unpack
 
-from ..compat import binary_to_str, str_to_binary
-
 __all__ = ['TCompactProtocol', 'TCompactProtocolFactory']
 
 CLEAR = 0
@@ -165,7 +163,7 @@
         if tseqid < 0:
             tseqid = 2147483648 + (2147483648 + tseqid)
         self.__writeVarint(tseqid)
-        self.__writeBinary(str_to_binary(name))
+        self.__writeBinary(bytes(name, 'utf-8'))
         self.state = VALUE_WRITE
 
     def writeMessageEnd(self):
@@ -346,7 +344,7 @@
         # however the sequence is actually signed...
         if seqid > 2147483647:
             seqid = -2147483648 - (2147483648 - seqid)
-        name = binary_to_str(self.__readBinary())
+        name = self.__readBinary().decode('utf-8')
         return (name, type, seqid)
 
     def readMessageEnd(self):
diff --git a/lib/py/src/protocol/TJSONProtocol.py b/lib/py/src/protocol/TJSONProtocol.py
index fef0cc9..a42aaa6 100644
--- a/lib/py/src/protocol/TJSONProtocol.py
+++ b/lib/py/src/protocol/TJSONProtocol.py
@@ -23,8 +23,6 @@
 import math
 import sys
 
-from ..compat import str_to_binary
-
 
 __all__ = ['TJSONProtocol',
            'TJSONProtocolFactory',
@@ -213,7 +211,7 @@
             escaped = ESCAPE_CHAR_VALS.get(s, s)
             json_str.append(escaped)
         json_str.append('"')
-        self.trans.write(str_to_binary(''.join(json_str)))
+        self.trans.write(bytes(''.join(json_str), 'utf-8'))
 
     def writeJSONNumber(self, number, formatter='{0}'):
         self.context.write()
@@ -313,7 +311,7 @@
                 utf8_bytes = bytearray([ord(character)])
                 while ord(self.reader.peek()) >= 0x80:
                     utf8_bytes.append(ord(self.reader.read()))
-                character = utf8_bytes.decode('utf8')
+                character = utf8_bytes.decode('utf-8')
             string.append(character)
 
             if highSurrogate:
diff --git a/lib/py/src/protocol/TProtocol.py b/lib/py/src/protocol/TProtocol.py
index ec71ab3..a7336c5 100644
--- a/lib/py/src/protocol/TProtocol.py
+++ b/lib/py/src/protocol/TProtocol.py
@@ -19,7 +19,6 @@
 
 from thrift.Thrift import TException, TType, TFrozenDict
 from thrift.transport.TTransport import TTransportException
-from ..compat import binary_to_str, str_to_binary
 
 import sys
 from itertools import islice
@@ -117,13 +116,13 @@
         pass
 
     def writeString(self, str_val):
-        self.writeBinary(str_to_binary(str_val))
+        self.writeBinary(bytes(str_val, 'utf-8'))
 
     def writeBinary(self, str_val):
         pass
 
     def writeUtf8(self, str_val):
-        self.writeString(str_val.encode('utf8'))
+        self.writeString(str_val.encode('utf-8'))
 
     def readMessageBegin(self):
         pass
@@ -180,13 +179,13 @@
         pass
 
     def readString(self):
-        return binary_to_str(self.readBinary())
+        return self.readBinary().decode('utf-8')
 
     def readBinary(self):
         pass
 
     def readUtf8(self):
-        return self.readString().decode('utf8')
+        return self.readString().decode('utf-8')
 
     def skip(self, ttype):
         if ttype == TType.BOOL:
diff --git a/lib/py/src/transport/THeaderTransport.py b/lib/py/src/transport/THeaderTransport.py
index 7c9827b..4fb2034 100644
--- a/lib/py/src/transport/THeaderTransport.py
+++ b/lib/py/src/transport/THeaderTransport.py
@@ -19,8 +19,8 @@
 
 import struct
 import zlib
+from io import BytesIO
 
-from thrift.compat import BufferIO, byte_index
 from thrift.protocol.TBinaryProtocol import TBinaryProtocol
 from thrift.protocol.TCompactProtocol import TCompactProtocol, readVarint, writeVarint
 from thrift.Thrift import TApplicationException
@@ -31,7 +31,6 @@
     TTransportException,
 )
 
-
 U16 = struct.Struct("!H")
 I32 = struct.Struct("!i")
 HEADER_MAGIC = 0x0FFF
@@ -92,10 +91,10 @@
         self._client_type = THeaderClientType.HEADERS
         self._allowed_client_types = allowed_client_types
 
-        self._read_buffer = BufferIO(b"")
+        self._read_buffer = BytesIO(b"")
         self._read_headers = {}
 
-        self._write_buffer = BufferIO()
+        self._write_buffer = BytesIO()
         self._write_headers = {}
         self._write_transforms = []
 
@@ -184,8 +183,8 @@
         if frame_size & TBinaryProtocol.VERSION_MASK == TBinaryProtocol.VERSION_1:
             self._set_client_type(THeaderClientType.UNFRAMED_BINARY)
             is_unframed = True
-        elif (byte_index(first_word, 0) == TCompactProtocol.PROTOCOL_ID and
-              byte_index(first_word, 1) & TCompactProtocol.VERSION_MASK == TCompactProtocol.VERSION):
+        elif (first_word[0] == TCompactProtocol.PROTOCOL_ID and
+              first_word[1] & TCompactProtocol.VERSION_MASK == TCompactProtocol.VERSION):
             self._set_client_type(THeaderClientType.UNFRAMED_COMPACT)
             is_unframed = True
 
@@ -195,7 +194,7 @@
                 rest = self._transport.read(bytes_left_to_read)
             else:
                 rest = b""
-            self._read_buffer = BufferIO(first_word + rest)
+            self._read_buffer = BytesIO(first_word + rest)
             return
 
         # ok, we're still here so we're framed.
@@ -204,7 +203,7 @@
                 TTransportException.SIZE_LIMIT,
                 "Frame was too large.",
             )
-        read_buffer = BufferIO(self._transport.readAll(frame_size))
+        read_buffer = BytesIO(self._transport.readAll(frame_size))
 
         # the next word is either going to be the version field of a
         # binary/compact protocol message or the magic value + flags of a
@@ -218,8 +217,8 @@
         elif version & TBinaryProtocol.VERSION_MASK == TBinaryProtocol.VERSION_1:
             self._set_client_type(THeaderClientType.FRAMED_BINARY)
             self._read_buffer = read_buffer
-        elif (byte_index(second_word, 0) == TCompactProtocol.PROTOCOL_ID and
-              byte_index(second_word, 1) & TCompactProtocol.VERSION_MASK == TCompactProtocol.VERSION):
+        elif (second_word[0] == TCompactProtocol.PROTOCOL_ID and
+              second_word[1] & TCompactProtocol.VERSION_MASK == TCompactProtocol.VERSION):
             self._set_client_type(THeaderClientType.FRAMED_COMPACT)
             self._read_buffer = read_buffer
         else:
@@ -229,7 +228,7 @@
             )
 
     def _parse_header_format(self, buffer):
-        # make BufferIO look like TTransport for varint helpers
+        # make BytesIO look like TTransport for varint helpers
         buffer_transport = TMemoryBuffer()
         buffer_transport._buffer = buffer
 
@@ -279,22 +278,22 @@
         for transform_id in transforms:
             transform_fn = READ_TRANSFORMS_BY_ID[transform_id]
             payload = transform_fn(payload)
-        return BufferIO(payload)
+        return BytesIO(payload)
 
     def write(self, buf):
         self._write_buffer.write(buf)
 
     def flush(self):
         payload = self._write_buffer.getvalue()
-        self._write_buffer = BufferIO()
+        self._write_buffer = BytesIO()
 
-        buffer = BufferIO()
+        buffer = BytesIO()
         if self._client_type == THeaderClientType.HEADERS:
             for transform_id in self._write_transforms:
                 transform_fn = WRITE_TRANSFORMS_BY_ID[transform_id]
                 payload = transform_fn(payload)
 
-            headers = BufferIO()
+            headers = BytesIO()
             writeVarint(headers, self._protocol_id)
             writeVarint(headers, len(self._write_transforms))
             for transform_id in self._write_transforms:
@@ -348,5 +347,5 @@
         result = bytearray(partialread)
         while len(result) < reqlen:
             result += self.read(reqlen - len(result))
-        self._read_buffer = BufferIO(result)
+        self._read_buffer = BytesIO(result)
         return self._read_buffer
diff --git a/lib/py/src/transport/TTransport.py b/lib/py/src/transport/TTransport.py
index a686b12..4f6b67f 100644
--- a/lib/py/src/transport/TTransport.py
+++ b/lib/py/src/transport/TTransport.py
@@ -17,9 +17,9 @@
 # under the License.
 #
 
-from io import BytesIO as BufferIO
-
+from io import BytesIO
 from struct import pack, unpack
+
 from thrift.Thrift import TException
 
 
@@ -144,9 +144,9 @@
 
     def __init__(self, trans, rbuf_size=DEFAULT_BUFFER):
         self.__trans = trans
-        self.__wbuf = BufferIO()
+        self.__wbuf = BytesIO()
         # Pass string argument to initialize read buffer as cStringIO.InputType
-        self.__rbuf = BufferIO(b'')
+        self.__rbuf = BytesIO(b'')
         self.__rbuf_size = rbuf_size
 
     def isOpen(self):
@@ -162,7 +162,7 @@
         ret = self.__rbuf.read(sz)
         if len(ret) != 0:
             return ret
-        self.__rbuf = BufferIO(self.__trans.read(max(sz, self.__rbuf_size)))
+        self.__rbuf = BytesIO(self.__trans.read(max(sz, self.__rbuf_size)))
         return self.__rbuf.read(sz)
 
     def write(self, buf):
@@ -170,13 +170,13 @@
             self.__wbuf.write(buf)
         except Exception as e:
             # on exception reset wbuf so it doesn't contain a partial function call
-            self.__wbuf = BufferIO()
+            self.__wbuf = BytesIO()
             raise e
 
     def flush(self):
         out = self.__wbuf.getvalue()
         # reset wbuf before write/flush to preserve state on underlying failure
-        self.__wbuf = BufferIO()
+        self.__wbuf = BytesIO()
         self.__trans.write(out)
         self.__trans.flush()
 
@@ -195,7 +195,7 @@
         if len(retstring) < reqlen:
             retstring += self.__trans.readAll(reqlen - len(retstring))
 
-        self.__rbuf = BufferIO(retstring)
+        self.__rbuf = BytesIO(retstring)
         return self.__rbuf
 
 
@@ -214,9 +214,9 @@
         If value is set, this will be a transport for reading,
         otherwise, it is for writing"""
         if value is not None:
-            self._buffer = BufferIO(value)
+            self._buffer = BytesIO(value)
         else:
-            self._buffer = BufferIO()
+            self._buffer = BytesIO()
         if offset:
             self._buffer.seek(offset)
 
@@ -264,8 +264,8 @@
 
     def __init__(self, trans,):
         self.__trans = trans
-        self.__rbuf = BufferIO(b'')
-        self.__wbuf = BufferIO()
+        self.__rbuf = BytesIO(b'')
+        self.__wbuf = BytesIO()
 
     def isOpen(self):
         return self.__trans.isOpen()
@@ -287,7 +287,7 @@
     def readFrame(self):
         buff = self.__trans.readAll(4)
         sz, = unpack('!i', buff)
-        self.__rbuf = BufferIO(self.__trans.readAll(sz))
+        self.__rbuf = BytesIO(self.__trans.readAll(sz))
 
     def write(self, buf):
         self.__wbuf.write(buf)
@@ -296,7 +296,7 @@
         wout = self.__wbuf.getvalue()
         wsz = len(wout)
         # reset wbuf before write/flush to preserve state on underlying failure
-        self.__wbuf = BufferIO()
+        self.__wbuf = BytesIO()
         # N.B.: Doing this string concatenation is WAY cheaper than making
         # two separate calls to the underlying socket object. Socket writes in
         # Python turn out to be REALLY expensive, but it seems to do a pretty
@@ -317,7 +317,7 @@
         while len(prefix) < reqlen:
             self.readFrame()
             prefix += self.__rbuf.getvalue()
-        self.__rbuf = BufferIO(prefix)
+        self.__rbuf = BytesIO(prefix)
         return self.__rbuf
 
 
@@ -371,8 +371,8 @@
         self.transport = transport
         self.sasl = SASLClient(host, service, mechanism, **sasl_kwargs)
 
-        self.__wbuf = BufferIO()
-        self.__rbuf = BufferIO(b'')
+        self.__wbuf = BytesIO()
+        self.__rbuf = BytesIO(b'')
 
     def open(self):
         if not self.transport.isOpen():
@@ -424,7 +424,7 @@
         encoded = self.sasl.wrap(data)
         self.transport.write(pack("!i", len(encoded)) + encoded)
         self.transport.flush()
-        self.__wbuf = BufferIO()
+        self.__wbuf = BytesIO()
 
     def read(self, sz):
         ret = self.__rbuf.read(sz)
@@ -438,7 +438,7 @@
         header = self.transport.readAll(4)
         length, = unpack('!i', header)
         encoded = self.transport.readAll(length)
-        self.__rbuf = BufferIO(self.sasl.unwrap(encoded))
+        self.__rbuf = BytesIO(self.sasl.unwrap(encoded))
 
     def close(self):
         self.sasl.dispose()
@@ -456,5 +456,5 @@
         while len(prefix) < reqlen:
             self._read_frame()
             prefix += self.__rbuf.getvalue()
-        self.__rbuf = BufferIO(prefix)
+        self.__rbuf = BytesIO(prefix)
         return self.__rbuf
diff --git a/lib/py/src/transport/TZlibTransport.py b/lib/py/src/transport/TZlibTransport.py
index 8b08297..a476d2a 100644
--- a/lib/py/src/transport/TZlibTransport.py
+++ b/lib/py/src/transport/TZlibTransport.py
@@ -23,8 +23,9 @@
 """
 
 import zlib
+from io import BytesIO
+
 from .TTransport import TTransportBase, CReadableTransport
-from ..compat import BufferIO
 
 
 class TZlibTransportFactory:
@@ -87,8 +88,8 @@
         """
         self.__trans = trans
         self.compresslevel = compresslevel
-        self.__rbuf = BufferIO()
-        self.__wbuf = BufferIO()
+        self.__rbuf = BytesIO()
+        self.__wbuf = BytesIO()
         self._init_zlib()
         self._init_stats()
 
@@ -96,8 +97,8 @@
         """Internal method to initialize/reset the internal StringIO objects
         for read and write buffers.
         """
-        self.__rbuf = BufferIO()
-        self.__wbuf = BufferIO()
+        self.__rbuf = BytesIO()
+        self.__wbuf = BytesIO()
 
     def _init_stats(self):
         """Internal method to reset the internal statistics counters
@@ -202,7 +203,7 @@
         self.bytes_in += len(zbuf)
         self.bytes_in_comp += len(buf)
         old = self.__rbuf.read()
-        self.__rbuf = BufferIO(old + buf)
+        self.__rbuf = BytesIO(old + buf)
         if len(old) + len(buf) == 0:
             return False
         return True
@@ -227,7 +228,7 @@
         ztail = self._zcomp_write.flush(zlib.Z_SYNC_FLUSH)
         self.bytes_out_comp += len(ztail)
         if (len(zbuf) + len(ztail)) > 0:
-            self.__wbuf = BufferIO()
+            self.__wbuf = BytesIO()
             self.__trans.write(zbuf + ztail)
         self.__trans.flush()
 
@@ -243,5 +244,5 @@
             retstring += self.read(self.DEFAULT_BUFFSIZE)
         while len(retstring) < reqlen:
             retstring += self.read(reqlen - len(retstring))
-        self.__rbuf = BufferIO(retstring)
+        self.__rbuf = BytesIO(retstring)
         return self.__rbuf
diff --git a/lib/py/test/thrift_json.py b/lib/py/test/thrift_json.py
index 5a491e2..bf2b808 100644
--- a/lib/py/test/thrift_json.py
+++ b/lib/py/test/thrift_json.py
@@ -17,7 +17,6 @@
 # under the License.
 #
 
-import sys
 import unittest
 
 import _import_local_thrift  # noqa