# Copyright 2011 Quanta Research Cambridge, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| 14 | """The entry point for the execution of a workloadTo execute a workload.
|
| 15 | Users pass in a description of the workload and a nova manager object
|
| 16 | to the bash_openstack function call"""
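
# Illustrative only: a rough sketch of how a caller might drive this module.
# The choice list is left abstract and the keyword values are made up; real
# BasherChoice actions come from the test_case module and the manager from
# the surrounding stress-test configuration.
#
#     choice_spec = [...]  # BasherChoice actions, probabilities summing to 100
#     bash_openstack(manager, choice_spec,
#                    duration=datetime.timedelta(seconds=300),
#                    sleep_time=500,        # msec between actions
#                    max_vms=16,
#                    test_name="sample stress run")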


import datetime
import logging
import random
import time


# local imports
from test_case import *
from state import State
import utils.util
from config import StressConfig

# setup logging to file
logging.basicConfig(
    format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M:%S',
    filename="stress.debug.log",
    filemode="w",
    level=logging.DEBUG,
    )

# define a Handler which writes INFO messages or higher to sys.stdout
_console = logging.StreamHandler()
_console.setLevel(logging.INFO)
# set a format which is simpler for console use
_formatter = logging.Formatter('%(name)-20s: %(levelname)-8s %(message)s')
# tell the handler to use this format
_console.setFormatter(_formatter)
# add the handler to the root logger
logging.getLogger('').addHandler(_console)


def _create_cases(choice_spec):
    """
    Generate a workload of test cases from the workload description.
    """
    cases = []
    count = 0
    for choice in choice_spec:
        p = choice.probability
        for i in range(p):
            cases.append(choice)
        count = count + p
    assert count == 100
    return cases


def _get_compute_nodes(keypath, user, controller):
    """
    Returns a list of active compute nodes. List is generated by running
    nova-manage on the controller.
    """
    nodes = []
    if keypath is None or user is None:
        return nodes
    lines = utils.util.ssh(keypath, user, controller,
                           "nova-manage service list | grep ^nova-compute").\
        split('\n')
    # For example: nova-compute xg11eth0 nova enabled :-) 2011-10-31 18:57:46
    # This is fragile but there is, at present, no other way to get this info.
    for line in lines:
        words = line.split()
        if len(words) > 4 and words[4] == ":-)":
            nodes.append(words[1])
    return nodes


def _error_in_logs(keypath, logdir, user, nodes):
    """
    Detect errors in the nova log files on the controller and compute nodes.
    """
    grep = 'egrep "(ERROR|TRACE)" %s/*.log' % logdir
    for node in nodes:
        errors = utils.util.ssh(keypath, user, node, grep, check=False)
        if len(errors) > 0:
            logging.error('%s: %s' % (node, errors))
            return True
    return False


def bash_openstack(manager,
                   choice_spec,
                   **kwargs):
    """
    Workload driver. Executes a workload as specified by the `choice_spec`
    parameter against a nova cluster.

    `manager` : Manager object
    `choice_spec` : list of BasherChoice actions to run on the cluster
    `kwargs` : keyword arguments to the constructor of `test_case`
        `duration` = how long this test should last
                     (default: 10 seconds)
        `sleep_time` = time to sleep between actions, in msec
                       (default: 3000)
        `test_name` = human-readable workload description
                      (default: 'unnamed test')
        `max_vms` = maximum number of instances to launch
                    (default: 32)
        `seed` = random seed (default: None)
    """
    # get keyword arguments
    duration = kwargs.get('duration', datetime.timedelta(seconds=10))
    seed = kwargs.get('seed', None)
    sleep_time = float(kwargs.get('sleep_time', 3000)) / 1000
    max_vms = int(kwargs.get('max_vms', 32))
    test_name = kwargs.get('test_name', 'unnamed test')

    stress_config = StressConfig(manager.config._conf)
    keypath = stress_config.host_private_key_path
    user = stress_config.host_admin_user
    logdir = stress_config.nova_logdir
    computes = _get_compute_nodes(keypath, user, manager.config.identity.host)
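    # Clear out the old nova logs so that any ERROR/TRACE lines found by
    # _error_in_logs later can be attributed to this run rather than to
    # earlier activity on the nodes.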
    utils.util.execute_on_all(keypath, user, computes,
                              "rm -f %s/*.log" % logdir)
    random.seed(seed)
    cases = _create_cases(choice_spec)
    test_end_time = time.time() + duration.seconds
    state = State(max_vms=max_vms)

    retry_list = []
    last_retry = time.time()
    cooldown = False
    logcheck_count = 0
    test_succeeded = True
    logging.debug('=== Test "%s" on %s ===' %
                  (test_name, time.asctime(time.localtime())))
    for kw in kwargs:
        logging.debug('\t%s = %s', kw, kwargs[kw])

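    # Main loop: while the test window is open, invoke randomly chosen actions
    # and collect any verifications they hand back for retrying; once the
    # window closes, stop issuing new actions ("cooldown") and keep retrying
    # verifications until the retry list drains, scanning the nova logs for
    # errors along the way.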
    while True:
        if not cooldown:
            if time.time() < test_end_time:
                case = random.choice(cases)
                logging.debug('Chose %s' % case)
                retry = case.invoke(manager, state)
                if retry is not None:
                    retry_list.append(retry)
            else:
                logging.info('Cooling down...')
                cooldown = True
        if cooldown and len(retry_list) == 0:
            if _error_in_logs(keypath, logdir, user, computes):
                test_succeeded = False
            break
        # Retry verifications every 5 seconds.
        if time.time() - last_retry > 5:
            logging.debug('retry verifications for %d tasks', len(retry_list))
            new_retry_list = []
            for v in retry_list:
                if not v.retry():
                    new_retry_list.append(v)
            retry_list = new_retry_list
            last_retry = time.time()
        time.sleep(sleep_time)
        # Check error logs after 100 actions
        if logcheck_count > 100:
            if _error_in_logs(keypath, logdir, user, computes):
                test_succeeded = False
                break
            else:
                logcheck_count = 0
        else:
            logcheck_count = logcheck_count + 1
    # Cleanup
    logging.info('Cleaning up: terminating virtual machines...')
    vms = state.get_instances()
    active_vms = [v for _k, v in vms.iteritems() if v and v[1] == 'ACTIVE']
    for target in active_vms:
        manager.servers_client.delete_server(target[0]['id'])
    # check to see that the server was actually killed
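    # get_server is polled until it raises (presumably a not-found error once
    # the instance is gone); each server gets roughly 60 seconds before the
    # wait gives up.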
    for target in active_vms:
        kill_id = target[0]['id']
        i = 0
        while True:
            try:
                manager.servers_client.get_server(kill_id)
            except Exception:
                break
            i += 1
            if i > 60:
                raise Exception('server %s failed to terminate' % kill_id)
            time.sleep(1)
        logging.info('killed %s' % kill_id)
        state.delete_instance_state(kill_id)

    if test_succeeded:
        logging.info('*** Test succeeded ***')
    else:
        logging.info('*** Test had errors ***')
    return test_succeeded