blob: 3ccc6e32bd336f0d204aa2bba3b9a58e28d45240 [file] [log] [blame]
Roger Meier41ad4342015-03-24 22:30:40 +01001#
2# Licensed to the Apache Software Foundation (ASF) under one
3# or more contributor license agreements. See the NOTICE file
4# distributed with this work for additional information
5# regarding copyright ownership. The ASF licenses this file
6# to you under the Apache License, Version 2.0 (the
7# "License"); you may not use this file except in compliance
8# with the License. You may obtain a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing,
13# software distributed under the License is distributed on an
14# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15# KIND, either express or implied. See the License for the
16# specific language governing permissions and limitations
17# under the License.
18#
19
20import contextlib
21import multiprocessing
22import multiprocessing.managers
23import os
24import platform
25import random
Nobuaki Sukegawaa6ab1f52015-11-28 15:04:39 +090026import socket
Roger Meier41ad4342015-03-24 22:30:40 +010027import subprocess
Nobuaki Sukegawaa6ab1f52015-11-28 15:04:39 +090028import sys
Roger Meier41ad4342015-03-24 22:30:40 +010029import time
Roger Meier41ad4342015-03-24 22:30:40 +010030
Nobuaki Sukegawa2de27002015-11-22 01:13:48 +090031from .report import ExecReporter, SummaryReporter
James E. King III9bea32f2018-03-16 16:07:42 -040032from .test import TestEntry
33from .util import domain_socket_path
Roger Meier41ad4342015-03-24 22:30:40 +010034
Roger Meier41ad4342015-03-24 22:30:40 +010035RESULT_ERROR = 64
James E. King III9bea32f2018-03-16 16:07:42 -040036RESULT_TIMEOUT = 128
37SIGNONE = 0
38SIGKILL = 15
Roger Meier41ad4342015-03-24 22:30:40 +010039
Nobuaki Sukegawad479e232016-02-28 11:28:19 +090040# globals
41ports = None
42stop = None
43
Roger Meier41ad4342015-03-24 22:30:40 +010044
45class ExecutionContext(object):
James E. King III9bea32f2018-03-16 16:07:42 -040046 def __init__(self, cmd, cwd, env, stop_signal, is_server, report):
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +090047 self._log = multiprocessing.get_logger()
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +090048 self.cmd = cmd
49 self.cwd = cwd
50 self.env = env
James E. King III9bea32f2018-03-16 16:07:42 -040051 self.stop_signal = stop_signal
52 self.is_server = is_server
53 self.report = report
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +090054 self.expired = False
55 self.killed = False
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +090056 self.proc = None
Roger Meier41ad4342015-03-24 22:30:40 +010057
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +090058 def _popen_args(self):
59 args = {
60 'cwd': self.cwd,
61 'env': self.env,
62 'stdout': self.report.out,
63 'stderr': subprocess.STDOUT,
64 }
65 # make sure child processes doesn't remain after killing
66 if platform.system() == 'Windows':
67 DETACHED_PROCESS = 0x00000008
68 args.update(creationflags=DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP)
69 else:
70 args.update(preexec_fn=os.setsid)
71 return args
Roger Meier41ad4342015-03-24 22:30:40 +010072
James E. King III9bea32f2018-03-16 16:07:42 -040073 def start(self):
Alexandre Detiste24df0a52025-01-16 00:39:20 +010074 joined = ' '.join(self.cmd)
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +090075 self._log.debug('COMMAND: %s', joined)
76 self._log.debug('WORKDIR: %s', self.cwd)
77 self._log.debug('LOGFILE: %s', self.report.logpath)
78 self.report.begin()
79 self.proc = subprocess.Popen(self.cmd, **self._popen_args())
James E. King III9bea32f2018-03-16 16:07:42 -040080 self._log.debug(' PID: %d', self.proc.pid)
81 self._log.debug(' PGID: %d', os.getpgid(self.proc.pid))
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +090082 return self._scoped()
Roger Meier41ad4342015-03-24 22:30:40 +010083
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +090084 @contextlib.contextmanager
85 def _scoped(self):
86 yield self
James E. King III9bea32f2018-03-16 16:07:42 -040087 if self.is_server:
88 # the server is supposed to run until we stop it
89 if self.returncode is not None:
90 self.report.died()
91 else:
92 if self.stop_signal != SIGNONE:
93 if self.sigwait(self.stop_signal):
94 self.report.end(self.returncode)
95 else:
96 self.report.killed()
97 else:
98 self.sigwait(SIGKILL)
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +090099 else:
James E. King III9bea32f2018-03-16 16:07:42 -0400100 # the client is supposed to exit normally
101 if self.returncode is not None:
102 self.report.end(self.returncode)
103 else:
104 self.sigwait(SIGKILL)
105 self.report.killed()
106 self._log.debug('[{0}] exited with return code {1}'.format(self.proc.pid, self.returncode))
Roger Meier41ad4342015-03-24 22:30:40 +0100107
James E. King III9bea32f2018-03-16 16:07:42 -0400108 # Send a signal to the process and then wait for it to end
109 # If the signal requested is SIGNONE, no signal is sent, and
110 # instead we just wait for the process to end; further if it
111 # does not end normally with SIGNONE, we mark it as expired.
112 # If the process fails to end and the signal is not SIGKILL,
113 # it re-runs with SIGKILL so that a real process kill occurs
114 # returns True if the process ended, False if it may not have
115 def sigwait(self, sig=SIGKILL, timeout=2):
116 try:
117 if sig != SIGNONE:
118 self._log.debug('[{0}] send signal {1}'.format(self.proc.pid, sig))
119 if sig == SIGKILL:
120 self.killed = True
121 try:
122 if platform.system() != 'Windows':
123 os.killpg(os.getpgid(self.proc.pid), sig)
124 else:
125 self.proc.send_signal(sig)
126 except Exception:
127 self._log.info('[{0}] Failed to kill process'.format(self.proc.pid), exc_info=sys.exc_info())
128 self._log.debug('[{0}] wait begin, timeout {1} sec(s)'.format(self.proc.pid, timeout))
129 self.proc.communicate(timeout=timeout)
130 self._log.debug('[{0}] process ended with return code {1}'.format(self.proc.pid, self.returncode))
131 self.report.end(self.returncode)
132 return True
133 except subprocess.TimeoutExpired:
134 self._log.info('[{0}] timeout waiting for process to end'.format(self.proc.pid))
135 if sig == SIGNONE:
136 self.expired = True
137 return False if sig == SIGKILL else self.sigwait(SIGKILL, 1)
138
139 # called on the client process to wait for it to end naturally
140 def wait(self, timeout):
141 self.sigwait(SIGNONE, timeout)
Roger Meier41ad4342015-03-24 22:30:40 +0100142
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900143 @property
144 def returncode(self):
145 return self.proc.returncode if self.proc else None
Roger Meier41ad4342015-03-24 22:30:40 +0100146
147
James E. King III9bea32f2018-03-16 16:07:42 -0400148def exec_context(port, logdir, test, prog, is_server):
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900149 report = ExecReporter(logdir, test, prog)
150 prog.build_command(port)
James E. King III9bea32f2018-03-16 16:07:42 -0400151 return ExecutionContext(prog.command, prog.workdir, prog.env, prog.stop_signal, is_server, report)
Roger Meier41ad4342015-03-24 22:30:40 +0100152
153
James E. King IIIf7d43ce2018-11-06 09:32:51 -0500154def run_test(testdir, logdir, test_dict, max_retry, async_mode=True):
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900155 logger = multiprocessing.get_logger()
156
James E. King III9bea32f2018-03-16 16:07:42 -0400157 def ensure_socket_open(sv, port, test):
158 slept = 0.1
159 time.sleep(slept)
160 sleep_step = 0.1
Nobuaki Sukegawae9b32342016-02-27 03:44:02 +0900161 while True:
James E. King III9bea32f2018-03-16 16:07:42 -0400162 if slept > test.delay:
163 logger.warn('[{0}] slept for {1} seconds but server is not open'.format(sv.proc.pid, slept))
164 return False
165 if test.socket == 'domain':
166 if not os.path.exists(domain_socket_path(port)):
167 logger.debug('[{0}] domain(unix) socket not available yet. slept for {1} seconds so far'.format(sv.proc.pid, slept))
168 time.sleep(sleep_step)
169 slept += sleep_step
170 elif test.socket == 'abstract':
171 return True
172 else:
173 # Create sockets every iteration because refused sockets cannot be
174 # reused on some systems.
175 sock4 = socket.socket()
176 sock6 = socket.socket(family=socket.AF_INET6)
177 try:
178 if sock4.connect_ex(('127.0.0.1', port)) == 0 \
179 or sock6.connect_ex(('::1', port)) == 0:
180 return True
181 if sv.proc.poll() is not None:
182 logger.warn('[{0}] server process is exited'.format(sv.proc.pid))
183 return False
184 logger.debug('[{0}] socket not available yet. slept for {1} seconds so far'.format(sv.proc.pid, slept))
185 time.sleep(sleep_step)
186 slept += sleep_step
187 finally:
188 sock4.close()
189 sock6.close()
190 logger.debug('[{0}] server ready - waited for {1} seconds'.format(sv.proc.pid, slept))
191 return True
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900192
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900193 try:
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900194 max_bind_retry = 3
195 retry_count = 0
196 bind_retry_count = 0
197 test = TestEntry(testdir, **test_dict)
198 while True:
199 if stop.is_set():
200 logger.debug('Skipping because shutting down')
201 return (retry_count, None)
202 logger.debug('Start')
203 with PortAllocator.alloc_port_scoped(ports, test.socket) as port:
204 logger.debug('Start with port %d' % port)
James E. King III9bea32f2018-03-16 16:07:42 -0400205 sv = exec_context(port, logdir, test, test.server, True)
206 cl = exec_context(port, logdir, test, test.client, False)
Roger Meier41ad4342015-03-24 22:30:40 +0100207
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900208 logger.debug('Starting server')
209 with sv.start():
James E. King III9bea32f2018-03-16 16:07:42 -0400210 port_ok = ensure_socket_open(sv, port, test)
Nobuaki Sukegawa103974e2016-02-28 10:07:30 +0900211 if port_ok:
212 connect_retry_count = 0
James E. King III9bea32f2018-03-16 16:07:42 -0400213 max_connect_retry = 12
214 connect_retry_wait = 0.25
Nobuaki Sukegawa103974e2016-02-28 10:07:30 +0900215 while True:
216 if sv.proc.poll() is not None:
217 logger.info('not starting client because server process is absent')
218 break
219 logger.debug('Starting client')
James E. King III9bea32f2018-03-16 16:07:42 -0400220 cl.start()
221 logger.debug('Waiting client (up to %d secs)' % test.timeout)
222 cl.wait(test.timeout)
Nobuaki Sukegawa103974e2016-02-28 10:07:30 +0900223 if not cl.report.maybe_false_positive() or connect_retry_count >= max_connect_retry:
224 if connect_retry_count > 0 and connect_retry_count < max_connect_retry:
James E. King III9bea32f2018-03-16 16:07:42 -0400225 logger.info('[%s]: Connected after %d retry (%.2f sec each)' % (test.server.name, connect_retry_count, connect_retry_wait))
Nobuaki Sukegawa103974e2016-02-28 10:07:30 +0900226 # Wait for 50ms to see if server does not die at the end.
227 time.sleep(0.05)
228 break
229 logger.debug('Server may not be ready, waiting %.2f second...' % connect_retry_wait)
230 time.sleep(connect_retry_wait)
231 connect_retry_count += 1
Roger Meier41ad4342015-03-24 22:30:40 +0100232
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900233 if sv.report.maybe_false_positive() and bind_retry_count < max_bind_retry:
234 logger.warn('[%s]: Detected socket bind failure, retrying...', test.server.name)
235 bind_retry_count += 1
236 else:
James E. King III6f8c99e2018-03-24 16:32:02 -0400237 result = RESULT_TIMEOUT if cl.expired else cl.returncode if (cl.proc and cl.proc.poll()) is not None else RESULT_ERROR
James E. King III9bea32f2018-03-16 16:07:42 -0400238
239 # For servers that handle a controlled shutdown by signal
240 # if they are killed, or return an error code, that is a
241 # problem. For servers that are not signal-aware, we simply
242 # kill them off; if we didn't kill them off, something else
243 # happened (crashed?)
244 if test.server.stop_signal != 0:
Jiayu Liu49b2d6b2022-04-06 16:49:09 +0800245 # for bash scripts, 128+N is the exit code for signal N, since we are sending
246 # DEFAULT_SIGNAL=1, 128 + 1 is the expected err code
247 # http://www.gnu.org/software/bash/manual/html_node/Exit-Status.html
248 allowed_return_code = set([-1, 0, 128 + 1])
249 if sv.killed or sv.returncode not in allowed_return_code:
James E. King III9bea32f2018-03-16 16:07:42 -0400250 result |= RESULT_ERROR
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900251 else:
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900252 if not sv.killed:
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900253 result |= RESULT_ERROR
Nobuaki Sukegawa5e432ec2016-01-24 01:34:29 +0900254
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900255 if result == 0 or retry_count >= max_retry:
256 return (retry_count, result)
257 else:
258 logger.info('[%s-%s]: test failed, retrying...', test.server.name, test.client.name)
259 retry_count += 1
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900260 except Exception:
James E. King IIIf7d43ce2018-11-06 09:32:51 -0500261 if not async_mode:
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900262 raise
263 logger.warn('Error executing [%s]', test.name, exc_info=True)
264 return (retry_count, RESULT_ERROR)
James E. King III3ec40312019-01-31 18:35:51 -0500265 except Exception:
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900266 logger.info('Interrupted execution', exc_info=True)
James E. King IIIf7d43ce2018-11-06 09:32:51 -0500267 if not async_mode:
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900268 raise
269 stop.set()
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900270 return (retry_count, RESULT_ERROR)
Roger Meier41ad4342015-03-24 22:30:40 +0100271
272
273class PortAllocator(object):
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900274 def __init__(self):
275 self._log = multiprocessing.get_logger()
276 self._lock = multiprocessing.Lock()
277 self._ports = set()
278 self._dom_ports = set()
279 self._last_alloc = 0
Roger Meier41ad4342015-03-24 22:30:40 +0100280
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900281 def _get_tcp_port(self):
282 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900283 sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
284 sock.bind(('', 0))
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900285 port = sock.getsockname()[1]
286 self._lock.acquire()
287 try:
288 ok = port not in self._ports
289 if ok:
290 self._ports.add(port)
291 self._last_alloc = time.time()
292 finally:
293 self._lock.release()
294 sock.close()
295 return port if ok else self._get_tcp_port()
Roger Meier41ad4342015-03-24 22:30:40 +0100296
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900297 def _get_domain_port(self):
298 port = random.randint(1024, 65536)
299 self._lock.acquire()
300 try:
301 ok = port not in self._dom_ports
302 if ok:
303 self._dom_ports.add(port)
304 finally:
305 self._lock.release()
306 return port if ok else self._get_domain_port()
Roger Meier41ad4342015-03-24 22:30:40 +0100307
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900308 def alloc_port(self, socket_type):
309 if socket_type in ('domain', 'abstract'):
310 return self._get_domain_port()
311 else:
312 return self._get_tcp_port()
Roger Meier41ad4342015-03-24 22:30:40 +0100313
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900314 # static method for inter-process invokation
315 @staticmethod
316 @contextlib.contextmanager
317 def alloc_port_scoped(allocator, socket_type):
318 port = allocator.alloc_port(socket_type)
319 yield port
320 allocator.free_port(socket_type, port)
Roger Meier41ad4342015-03-24 22:30:40 +0100321
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900322 def free_port(self, socket_type, port):
323 self._log.debug('free_port')
324 self._lock.acquire()
325 try:
326 if socket_type == 'domain':
327 self._dom_ports.remove(port)
328 path = domain_socket_path(port)
329 if os.path.exists(path):
330 os.remove(path)
331 elif socket_type == 'abstract':
332 self._dom_ports.remove(port)
333 else:
334 self._ports.remove(port)
335 except IOError:
336 self._log.info('Error while freeing port', exc_info=sys.exc_info())
337 finally:
338 self._lock.release()
Roger Meier41ad4342015-03-24 22:30:40 +0100339
340
341class NonAsyncResult(object):
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900342 def __init__(self, value):
343 self._value = value
Roger Meier41ad4342015-03-24 22:30:40 +0100344
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900345 def get(self, timeout=None):
346 return self._value
Roger Meier41ad4342015-03-24 22:30:40 +0100347
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900348 def wait(self, timeout=None):
349 pass
Roger Meier41ad4342015-03-24 22:30:40 +0100350
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900351 def ready(self):
352 return True
Roger Meier41ad4342015-03-24 22:30:40 +0100353
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900354 def successful(self):
355 return self._value == 0
Roger Meier41ad4342015-03-24 22:30:40 +0100356
357
358class TestDispatcher(object):
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900359 def __init__(self, testdir, basedir, logdir_rel, concurrency):
360 self._log = multiprocessing.get_logger()
361 self.testdir = testdir
362 self._report = SummaryReporter(basedir, logdir_rel, concurrency > 1)
363 self.logdir = self._report.testdir
364 # seems needed for python 2.x to handle keyboard interrupt
365 self._stop = multiprocessing.Event()
366 self._async = concurrency > 1
367 if not self._async:
368 self._pool = None
369 global stop
370 global ports
371 stop = self._stop
372 ports = PortAllocator()
373 else:
374 self._m = multiprocessing.managers.BaseManager()
375 self._m.register('ports', PortAllocator)
376 self._m.start()
377 self._pool = multiprocessing.Pool(concurrency, self._pool_init, (self._m.address,))
378 self._log.debug(
379 'TestDispatcher started with %d concurrent jobs' % concurrency)
Roger Meier41ad4342015-03-24 22:30:40 +0100380
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900381 def _pool_init(self, address):
382 global stop
383 global m
384 global ports
385 stop = self._stop
386 m = multiprocessing.managers.BaseManager(address)
387 m.connect()
388 ports = m.ports()
Roger Meier41ad4342015-03-24 22:30:40 +0100389
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900390 def _dispatch_sync(self, test, cont, max_retry):
James E. King IIIf7d43ce2018-11-06 09:32:51 -0500391 r = run_test(self.testdir, self.logdir, test, max_retry, async_mode=False)
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900392 cont(r)
393 return NonAsyncResult(r)
Roger Meier41ad4342015-03-24 22:30:40 +0100394
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900395 def _dispatch_async(self, test, cont, max_retry):
396 self._log.debug('_dispatch_async')
397 return self._pool.apply_async(func=run_test, args=(self.testdir, self.logdir, test, max_retry), callback=cont)
Roger Meier41ad4342015-03-24 22:30:40 +0100398
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900399 def dispatch(self, test, max_retry):
400 index = self._report.add_test(test)
Roger Meier41ad4342015-03-24 22:30:40 +0100401
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900402 def cont(result):
403 if not self._stop.is_set():
Nobuaki Sukegawa59310f52016-02-18 01:41:46 +0900404 if result and len(result) == 2:
405 retry_count, returncode = result
406 else:
407 retry_count = 0
408 returncode = RESULT_ERROR
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900409 self._log.debug('freeing port')
410 self._log.debug('adding result')
411 self._report.add_result(index, returncode, returncode == RESULT_TIMEOUT, retry_count)
412 self._log.debug('finish continuation')
413 fn = self._dispatch_async if self._async else self._dispatch_sync
414 return fn(test, cont, max_retry)
Roger Meier41ad4342015-03-24 22:30:40 +0100415
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900416 def wait(self):
417 if self._async:
418 self._pool.close()
419 self._pool.join()
420 self._m.shutdown()
421 return self._report.end()
Roger Meier41ad4342015-03-24 22:30:40 +0100422
Nobuaki Sukegawa10308cb2016-02-03 01:57:03 +0900423 def terminate(self):
424 self._stop.set()
425 if self._async:
426 self._pool.terminate()
427 self._pool.join()
428 self._m.shutdown()