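THRIFT-2463 test/py/RunClientServer.py fails sometimes
Client: Python
Patch: Benoit Sigoure

Replace the fixed 0.15 second sleep after launching the test server with a
loop that polls the port until the server accepts connections (up to 100
attempts, 0.1 seconds apart), failing early if the server process exits.
Also wait for the server process to terminate after sending SIGKILL instead
of sleeping for a fixed 0.05 seconds.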
diff --git a/test/py/RunClientServer.py b/test/py/RunClientServer.py
index db0bfa4..782bd83 100755
--- a/test/py/RunClientServer.py
+++ b/test/py/RunClientServer.py
@@ -21,6 +21,7 @@
 
 from __future__ import division
 import time
+import socket
 import subprocess
 import sys
 import os
@@ -133,7 +134,30 @@
   if options.verbose > 0:
     print 'Testing server %s: %s' % (server_class, ' '.join(server_args))
   serverproc = subprocess.Popen(server_args)
-  time.sleep(0.15)
+
+  def ensureServerAlive():  # raise if the server subprocess has exited
+    if serverproc.poll() is not None:
+      cmdline = ' '.join(server_args)
+      print ('FAIL: Server process (%s) failed with retcode %d'
+             % (cmdline, serverproc.returncode))
+      raise Exception('Server subprocess %s died, args: %s' % (server_class, cmdline))
+
+  # Poll until the server accepts connections on the given port.
+  sleep_time = 0.1  # Seconds
+  max_attempts = 100
+  for attempt in range(max_attempts):
+    sock = socket.socket()  # reuse after a failed connect() is not portable
+    try:
+      if sock.connect_ex(('127.0.0.1', port)) == 0:
+        break
+    finally:
+      sock.close()
+    ensureServerAlive()
+    time.sleep(sleep_time)
+  else:
+    raise Exception("TestServer not ready on port %d after %.2f seconds"
+                    % (port, sleep_time * max_attempts))
+
   try:
     if options.verbose > 0:
       print 'Testing client: %s' % (' '.join(cli_args))
@@ -142,19 +166,15 @@
       raise Exception("Client subprocess failed, retcode=%d, args: %s" % (ret, ' '.join(cli_args)))
   finally:
     # check that server didn't die
-    serverproc.poll()
-    if serverproc.returncode is not None:
-      print 'FAIL: Server process (%s) failed with retcode %d' % (' '.join(server_args), serverproc.returncode)
-      raise Exception('Server subprocess %s died, args: %s' % (server_class, ' '.join(server_args)))
-    else:
-      extra_sleep = EXTRA_DELAY.get(server_class, 0)
-      if extra_sleep > 0 and options.verbose > 0:
-        print 'Giving %s (proto=%s,zlib=%s,ssl=%s) an extra %d seconds for child processes to terminate via alarm' % (server_class,
-              proto, use_zlib, use_ssl, extra_sleep)
-        time.sleep(extra_sleep)
-      os.kill(serverproc.pid, signal.SIGKILL)
-  # wait for shutdown
-  time.sleep(0.05)
+    ensureServerAlive()
+    extra_sleep = EXTRA_DELAY.get(server_class, 0)
+    if extra_sleep > 0 and options.verbose > 0:
+      print ('Giving %s (proto=%s,zlib=%s,ssl=%s) an extra %d seconds for child '
+             'processes to terminate via alarm'
+             % (server_class, proto, use_zlib, use_ssl, extra_sleep))
+      time.sleep(extra_sleep)
+    os.kill(serverproc.pid, signal.SIGKILL)
+    serverproc.wait()  # reap the killed server before starting the next test
 
 test_count = 0
 # run tests without a client/server first