THRIFT-3579 Introduce retry to make cross
This closes #817
diff --git a/test/crossrunner/run.py b/test/crossrunner/run.py
index 0d617c0..68bd928 100644
--- a/test/crossrunner/run.py
+++ b/test/crossrunner/run.py
@@ -60,12 +60,12 @@
if platform.system() != 'Windows':
try:
os.killpg(self.proc.pid, signal.SIGKILL)
- except Exception as err:
- self._log.info('Failed to kill process group : %s' % str(err))
+ except Exception:
+ self._log.info('Failed to kill process group', exc_info=sys.exc_info())
try:
self.proc.kill()
- except Exception as err:
- self._log.info('Failed to kill process : %s' % str(err))
+ except Exception:
+ self._log.info('Failed to kill process', exc_info=sys.exc_info())
def _popen_args(self):
args = {
@@ -122,15 +122,17 @@
return ExecutionContext(prog.command, prog.workdir, prog.env, report)
-def run_test(testdir, logdir, test_dict, async=True, max_retry=3):
+def run_test(testdir, logdir, test_dict, max_retry, async=True):
try:
logger = multiprocessing.get_logger()
+ max_bind_retry = 3
retry_count = 0
+ bind_retry_count = 0
test = TestEntry(testdir, **test_dict)
while True:
if stop.is_set():
logger.debug('Skipping because shutting down')
- return None
+ return (retry_count, None)
logger.debug('Start')
with PortAllocator.alloc_port_scoped(ports, test.socket) as port:
logger.debug('Start with port %d' % port)
@@ -142,35 +144,41 @@
if test.delay > 0:
logger.debug('Delaying client for %.2f seconds' % test.delay)
time.sleep(test.delay)
- cl_retry_count = 0
- cl_max_retry = 10
- cl_retry_wait = 0.5
+ connect_retry_count = 0
+ max_connect_retry = 10
+ connect_retry_wait = 0.5
while True:
logger.debug('Starting client')
cl.start(test.timeout)
logger.debug('Waiting client')
cl.wait()
- if not cl.report.maybe_false_positive() or cl_retry_count >= cl_max_retry:
- if cl_retry_count > 0 and cl_retry_count < cl_max_retry:
- logger.warn('[%s]: Connected after %d retry (%.2f sec each)' % (test.server.name, cl_retry_count, cl_retry_wait))
- # Wait for 50 ms to see if server does not die at the end.
+ if not cl.report.maybe_false_positive() or connect_retry_count >= max_connect_retry:
+ if connect_retry_count > 0 and connect_retry_count < max_connect_retry:
+ logger.warn('[%s]: Connected after %d retry (%.2f sec each)' % (test.server.name, connect_retry_count, connect_retry_wait))
+ # Wait for 50ms to see if server does not die at the end.
time.sleep(0.05)
break
- logger.debug('Server may not be ready, waiting %.2f second...' % cl_retry_wait)
- time.sleep(cl_retry_wait)
- cl_retry_count += 1
+ logger.debug('Server may not be ready, waiting %.2f second...' % connect_retry_wait)
+ time.sleep(connect_retry_wait)
+ connect_retry_count += 1
- if not sv.report.maybe_false_positive() or retry_count >= max_retry:
- logger.debug('Finish')
+ if sv.report.maybe_false_positive() and bind_retry_count < max_bind_retry:
+ logger.warn('[%s]: Detected socket bind failure, retrying...', test.server.name)
+ bind_retry_count += 1
+ else:
if cl.expired:
- return RESULT_TIMEOUT
+ result = RESULT_TIMEOUT
elif not sv.killed and cl.proc.returncode == 0:
# Server should be alive at the end.
- return RESULT_ERROR
+ result = RESULT_ERROR
else:
- return cl.proc.returncode
- logger.warn('[%s]: Detected socket bind failure, retrying...' % test.server.name)
- retry_count += 1
+ result = cl.proc.returncode
+
+ if result == 0 or retry_count >= max_retry:
+ return (retry_count, result)
+ else:
+ logger.info('[%s-%s]: test failed, retrying...', test.server.name, test.client.name)
+ retry_count += 1
except (KeyboardInterrupt, SystemExit):
logger.info('Interrupted execution')
if not async:
@@ -181,7 +189,7 @@
if not async:
raise
logger.warn('Error executing [%s]', test.name, exc_info=sys.exc_info())
- return RESULT_ERROR
+ return (retry_count, RESULT_ERROR)
class PortAllocator(object):
@@ -245,8 +253,8 @@
self._dom_ports.remove(port)
else:
self._ports.remove(port)
- except IOError as err:
- self._log.info('Error while freeing port : %s' % str(err))
+ except IOError:
+ self._log.info('Error while freeing port', exc_info=sys.exc_info())
finally:
self._lock.release()
@@ -300,26 +308,27 @@
m.connect()
ports = m.ports()
- def _dispatch_sync(self, test, cont):
- r = run_test(self.testdir, self.logdir, test, False)
+ def _dispatch_sync(self, test, cont, max_retry):
+ r = run_test(self.testdir, self.logdir, test, max_retry, False)
cont(r)
return NonAsyncResult(r)
- def _dispatch_async(self, test, cont):
+ def _dispatch_async(self, test, cont, max_retry):
self._log.debug('_dispatch_async')
- return self._pool.apply_async(func=run_test, args=(self.testdir, self.logdir, test,), callback=cont)
+ return self._pool.apply_async(func=run_test, args=(self.testdir, self.logdir, test, max_retry), callback=cont)
- def dispatch(self, test):
+ def dispatch(self, test, max_retry):
index = self._report.add_test(test)
- def cont(r):
+ def cont(result):
if not self._stop.is_set():
+ retry_count, returncode = result
self._log.debug('freeing port')
self._log.debug('adding result')
- self._report.add_result(index, r, r == RESULT_TIMEOUT)
+ self._report.add_result(index, returncode, returncode == RESULT_TIMEOUT, retry_count)
self._log.debug('finish continuation')
fn = self._dispatch_async if self._async else self._dispatch_sync
- return fn(test, cont)
+ return fn(test, cont, max_retry)
def wait(self):
if self._async: