blob: 9f263f62f15ff9a64478f6dda817e4727426591e [file] [log] [blame]
Sean Dague70112362012-04-03 13:48:49 -04001# Copyright 2011 Quanta Research Cambridge, Inc.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""The entry point for the execution of a workload.
15To execute a workload, users pass in a description of the workload and a
16nova manager object to the bash_openstack function call."""
17
18
19import random
20import datetime
21import time
22
23
24# local imports
25from test_case import *
26from state import State
27import utils.util
28from config import StressConfig
29
30# setup logging to file
# Root logger: record everything from DEBUG upwards in a debug log file that
# is opened in "w" mode, i.e. truncated on every run.
logging.basicConfig(
    level=logging.DEBUG,
    filename="stress.debug.log",
    filemode="w",
    format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M:%S',
)

# the console gets a simpler layout, without the timestamp
_formatter = logging.Formatter('%(name)-20s: %(levelname)-8s %(message)s')
# define a Handler which mirrors INFO messages or higher to the console;
# a StreamHandler created without an explicit stream writes to sys.stderr
_console = logging.StreamHandler()
_console.setFormatter(_formatter)
_console.setLevel(logging.INFO)
# the empty name selects the root logger, so every logger is mirrored
logging.getLogger('').addHandler(_console)
48
49
50def _create_cases(choice_spec):
51 """
52 Generate a workload of tests from workload description
53 """
54 cases = []
55 count = 0
56 for choice in choice_spec:
57 p = choice.probability
58 for i in range(p):
59 cases.append(choice)
60 i = i + p
61 count = count + p
62 assert(count == 100)
63 return cases
64
65
66def _get_compute_nodes(keypath, user, controller):
67 """
68 Returns a list of active compute nodes. List is generated by running
69 nova-manage on the controller.
70 """
71 nodes = []
72 if keypath == None or user == None:
73 return nodes
74 lines = utils.util.ssh(keypath, user, controller,
75 "nova-manage service list | grep ^nova-compute").\
76 split('\n')
77 # For example: nova-compute xg11eth0 nova enabled :-) 2011-10-31 18:57:46
78 # This is fragile but there is, at present, no other way to get this info.
79 for line in lines:
80 words = line.split()
81 if len(words) > 0 and words[4] == ":-)":
82 nodes.append(words[1])
83 return nodes
84
85
86def _error_in_logs(keypath, logdir, user, nodes):
87 """
88 Detect errors in the nova log files on the controller and compute nodes.
89 """
90 grep = 'egrep "ERROR\|TRACE" %s/*.log' % logdir
91 for node in nodes:
92 errors = utils.util.ssh(keypath, user, node, grep, check=False)
93 if len(errors) > 0:
94 logging.error('%s: %s' % (node, errors))
95 return True
96 return False
97
98
def bash_openstack(manager,
                   choice_spec,
                   **kwargs):
    """
    Workload driver. Executes a workload as specified by the `choice_spec`
    parameter against a nova-cluster.

    `manager`     : Manager object
    `choice_spec` : list of BasherChoice actions to run on the cluster
    `kwargs`      : optional keyword arguments
        `duration`   = how long new actions are issued, as a
                       datetime.timedelta (default: 10 seconds)
        `sleep_time` = time to sleep between actions, in msec
                       (default: 3000)
        `test_name`  = human readable workload description
                       (default: 'unamed test')
        `max_vms`    = maximum number of instances to launch
                       (default: `max_instances` of the stress config)
        `seed`       = random seed (default: None)

    Returns True if no error was found in the logs, False otherwise.
    Raises Exception("Cleanup timed out") when a deleted instance can still
    be fetched after roughly 60 seconds.
    """
    stress_config = StressConfig(manager.config._conf)
    # get keyword arguments
    duration = kwargs.get('duration', datetime.timedelta(seconds=10))
    seed = kwargs.get('seed', None)
    sleep_time = float(kwargs.get('sleep_time', 3000)) / 1000
    max_vms = int(kwargs.get('max_vms', stress_config.max_instances))
    test_name = kwargs.get('test_name', 'unamed test')

    keypath = stress_config.host_private_key_path
    user = stress_config.host_admin_user
    logdir = stress_config.nova_logdir
    computes = _get_compute_nodes(keypath, user, manager.config.identity.host)
    # start from empty logs so that only errors caused by this run are seen
    utils.util.execute_on_all(keypath, user, computes,
                              "rm -f %s/*.log" % logdir)
    random.seed(seed)
    cases = _create_cases(choice_spec)
    # `timedelta.seconds` alone ignores whole days, so add them explicitly;
    # otherwise a duration of a day or more would end far too early.
    test_end_time = time.time() + duration.days * 86400 + duration.seconds
    state = State(max_vms=max_vms)

    retry_list = []
    last_retry = time.time()
    cooldown = False
    logcheck_count = 0
    test_succeeded = True
    logging.debug('=== Test "%s" on %s ===',
                  test_name, time.asctime(time.localtime()))
    for kw, value in kwargs.items():
        logging.debug('\t%s = %s', kw, value)

    while True:
        if not cooldown:
            if time.time() < test_end_time:
                case = random.choice(cases)
                logging.debug('Chose %s', case)
                # an action may hand back an object whose verification has
                # to be retried later
                retry = case.invoke(manager, state)
                if retry is not None:
                    retry_list.append(retry)
            else:
                logging.info('Cooling down...')
                cooldown = True
        # Once no new actions are issued and nothing is left to verify, do a
        # final log check and stop.
        if cooldown and len(retry_list) == 0:
            if _error_in_logs(keypath, logdir, user, computes):
                test_succeeded = False
            break
        # Retry verifications every 5 seconds.
        if time.time() - last_retry > 5:
            logging.debug('retry verifications for %d tasks', len(retry_list))
            # keep only the verifications that still report failure
            retry_list = [v for v in retry_list if not v.retry()]
            last_retry = time.time()
        time.sleep(sleep_time)
        # Check error logs after 100 actions
        if logcheck_count > 100:
            if _error_in_logs(keypath, logdir, user, computes):
                test_succeeded = False
                break
            else:
                logcheck_count = 0
        else:
            logcheck_count = logcheck_count + 1

    # Cleanup
    logging.info('Cleaning up: terminating virtual machines...')
    vms = state.get_instances()
    # each non-empty value is indexed as (server, status) below
    active_vms = [v for v in vms.values() if v and v[1] == 'ACTIVE']
    for target in active_vms:
        manager.servers_client.delete_server(target[0]['id'])
    # check to see that the server was actually killed
    for target in active_vms:
        kill_id = target[0]['id']
        i = 0
        while True:
            try:
                manager.servers_client.get_server(kill_id)
            except Exception:
                # NOTE(review): any failure to fetch the server is taken to
                # mean that it is gone; narrow this to the client's
                # "not found" error once that type is confirmed.
                break
            i += 1
            if i > 60:
                # log whatever the nodes reported before giving up
                _error_in_logs(keypath, logdir, user, computes)
                raise Exception("Cleanup timed out")
            time.sleep(1)
        logging.info('killed %s', kill_id)
        state.delete_instance_state(kill_id)

    if test_succeeded:
        logging.info('*** Test succeeded ***')
    else:
        logging.info('*** Test had errors ***')
    return test_succeeded