Blame - stress/driver.py - packaging/sources/tempest

blob: 9f263f62f15ff9a64478f6dda817e4727426591e [file] [log] [blame]

Sean Dague	7011236	2012-04-03 13:48:49 -0400	[diff] [blame^]	1	# Copyright 2011 Quanta Research Cambridge, Inc.
				2	#
				3	# Licensed under the Apache License, Version 2.0 (the "License");
				4	# you may not use this file except in compliance with the License.
				5	# You may obtain a copy of the License at
				6	#
				7	# http://www.apache.org/licenses/LICENSE-2.0
				8	#
				9	# Unless required by applicable law or agreed to in writing, software
				10	# distributed under the License is distributed on an "AS IS" BASIS,
				11	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	# See the License for the specific language governing permissions and
				13	# limitations under the License.
				14	"""The entry point for the execution of a workloadTo execute a workload.
				15	Users pass in a description of the workload and a nova manager object
				16	to the bash_openstack function call"""
				17
				18
				19	import random
				20	import datetime
				21	import time
				22
				23
				24	# local imports
				25	from test_case import *
				26	from state import State
				27	import utils.util
				28	from config import StressConfig
				29
				30	# setup logging to file
				31	logging.basicConfig(
				32	format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
				33	datefmt='%m-%d %H:%M:%S',
				34	filename="stress.debug.log",
				35	filemode="w",
				36	level=logging.DEBUG,
				37	)
				38
				39	# define a Handler which writes INFO messages or higher to the sys.stdout
				40	_console = logging.StreamHandler()
				41	_console.setLevel(logging.INFO)
				42	# set a format which is simpler for console use
				43	_formatter = logging.Formatter('%(name)-20s: %(levelname)-8s %(message)s')
				44	# tell the handler to use this format
				45	_console.setFormatter(_formatter)
				46	# add the handler to the root logger
				47	logging.getLogger('').addHandler(_console)
				48
				49
				50	def _create_cases(choice_spec):
				51	"""
				52	Generate a workload of tests from workload description
				53	"""
				54	cases = []
				55	count = 0
				56	for choice in choice_spec:
				57	p = choice.probability
				58	for i in range(p):
				59	cases.append(choice)
				60	i = i + p
				61	count = count + p
				62	assert(count == 100)
				63	return cases
				64
				65
				66	def _get_compute_nodes(keypath, user, controller):
				67	"""
				68	Returns a list of active compute nodes. List is generated by running
				69	nova-manage on the controller.
				70	"""
				71	nodes = []
				72	if keypath == None or user == None:
				73	return nodes
				74	lines = utils.util.ssh(keypath, user, controller,
				75	"nova-manage service list \| grep ^nova-compute").\
				76	split('\n')
				77	# For example: nova-compute xg11eth0 nova enabled :-) 2011-10-31 18:57:46
				78	# This is fragile but there is, at present, no other way to get this info.
				79	for line in lines:
				80	words = line.split()
				81	if len(words) > 0 and words[4] == ":-)":
				82	nodes.append(words[1])
				83	return nodes
				84
				85
				86	def _error_in_logs(keypath, logdir, user, nodes):
				87	"""
				88	Detect errors in the nova log files on the controller and compute nodes.
				89	"""
				90	grep = 'egrep "ERROR\\|TRACE" %s/*.log' % logdir
				91	for node in nodes:
				92	errors = utils.util.ssh(keypath, user, node, grep, check=False)
				93	if len(errors) > 0:
				94	logging.error('%s: %s' % (node, errors))
				95	return True
				96	return False
				97
				98
				99	def bash_openstack(manager,
				100	choice_spec,
				101	**kwargs):
				102	"""
				103	Workload driver. Executes a workload as specified by the `choice_spec`
				104	parameter against a nova-cluster.
				105
				106	`manager` : Manager object
				107	`choice_spec` : list of BasherChoice actions to run on the cluster
				108	`kargs` : keyword arguments to the constructor of `test_case`
				109	`duration` = how long this test should last (3 sec)
				110	`sleep_time` = time to sleep between actions (in msec)
				111	`test_name` = human readable workload description
				112	(default: unnamed test)
				113	`max_vms` = maximum number of instances to launch
				114	(default: 32)
				115	`seed` = random seed (default: None)
				116	"""
				117	stress_config = StressConfig(manager.config._conf)
				118	# get keyword arguments
				119	duration = kwargs.get('duration', datetime.timedelta(seconds=10))
				120	seed = kwargs.get('seed', None)
				121	sleep_time = float(kwargs.get('sleep_time', 3000)) / 1000
				122	max_vms = int(kwargs.get('max_vms', stress_config.max_instances))
				123	test_name = kwargs.get('test_name', 'unamed test')
				124
				125	keypath = stress_config.host_private_key_path
				126	user = stress_config.host_admin_user
				127	logdir = stress_config.nova_logdir
				128	computes = _get_compute_nodes(keypath, user, manager.config.identity.host)
				129	utils.util.execute_on_all(keypath, user, computes,
				130	"rm -f %s/*.log" % logdir)
				131	random.seed(seed)
				132	cases = _create_cases(choice_spec)
				133	test_end_time = time.time() + duration.seconds
				134	state = State(max_vms=max_vms)
				135
				136	retry_list = []
				137	last_retry = time.time()
				138	cooldown = False
				139	logcheck_count = 0
				140	test_succeeded = True
				141	logging.debug('=== Test \"%s\" on %s ===' %
				142	(test_name, time.asctime(time.localtime())))
				143	for kw in kwargs:
				144	logging.debug('\t%s = %s', kw, kwargs[kw])
				145
				146	while True:
				147	if not cooldown:
				148	if time.time() < test_end_time:
				149	case = random.choice(cases)
				150	logging.debug('Chose %s' % case)
				151	retry = case.invoke(manager, state)
				152	if retry != None:
				153	retry_list.append(retry)
				154	else:
				155	logging.info('Cooling down...')
				156	cooldown = True
				157	if cooldown and len(retry_list) == 0:
				158	if _error_in_logs(keypath, logdir, user, computes):
				159	test_succeeded = False
				160	break
				161	# Retry verifications every 5 seconds.
				162	if time.time() - last_retry > 5:
				163	logging.debug('retry verifications for %d tasks', len(retry_list))
				164	new_retry_list = []
				165	for v in retry_list:
				166	if not v.retry():
				167	new_retry_list.append(v)
				168	retry_list = new_retry_list
				169	last_retry = time.time()
				170	time.sleep(sleep_time)
				171	# Check error logs after 100 actions
				172	if logcheck_count > 100:
				173	if _error_in_logs(keypath, logdir, user, computes):
				174	test_succeeded = False
				175	break
				176	else:
				177	logcheck_count = 0
				178	else:
				179	logcheck_count = logcheck_count + 1
				180	# Cleanup
				181	logging.info('Cleaning up: terminating virtual machines...')
				182	vms = state.get_instances()
				183	active_vms = [v for _k, v in vms.iteritems() if v and v[1] == 'ACTIVE']
				184	for target in active_vms:
				185	manager.servers_client.delete_server(target[0]['id'])
				186	# check to see that the server was actually killed
				187	for target in active_vms:
				188	kill_id = target[0]['id']
				189	i = 0
				190	while True:
				191	try:
				192	manager.servers_client.get_server(kill_id)
				193	except Exception:
				194	break
				195	i += 1
				196	if i > 60:
				197	_error_in_logs(keypath, logdir, user, computes)
				198	raise Exception("Cleanup timed out")
				199	time.sleep(1)
				200	logging.info('killed %s' % kill_id)
				201	state.delete_instance_state(kill_id)
				202
				203	if test_succeeded:
				204	logging.info('* Test succeeded *')
				205	else:
				206	logging.info('* Test had errors *')
				207	return test_succeeded