import time
import json
import copy
import logging
from concurrent.futures import Future
from typing import List, Dict, Tuple, Optional, Union, cast

from . import utils, ssh_utils, hw_info
from .config import ConfigBlock
from .node import setup_rpc, connect
from .node_interfaces import NodeInfo, IRPCNode
from .stage import Stage, StepOrder
from .sensors import collect_sensors_data
from .suits.all_suits import all_suits
from .test_run_class import TestRun
from .utils import StopTestError
from .result_classes import SuiteConfig
from .hlstorage import ResultStorage


logger = logging.getLogger("wally")


class ConnectStage(Stage):
    """Connect to nodes stage"""

    priority = StepOrder.CONNECT

    def run(self, ctx: TestRun) -> None:
        with ctx.get_pool() as pool:
            logger.info("Connecting to %s nodes", len(ctx.nodes_info))

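            # Try to open an SSH connection to a node and bootstrap the RPC agent
            # on it. Returns (True, rpc_node) on success and (False, node_info) on
            # failure, so failures can be collected instead of escaping pool.map.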
            def connect_ext(node_info: NodeInfo) -> Tuple[bool, Union[IRPCNode, NodeInfo]]:
                try:
                    ssh_node = connect(node_info, conn_timeout=ctx.config.connect_timeout)

                    return True, setup_rpc(ssh_node,
                                           ctx.rpc_code,
                                           ctx.default_rpc_plugins,
                                           log_level=ctx.config.rpc_log_level)
                except Exception as exc:
                    logger.exception("Failed to connect to %s: %s", node_info, exc)
                    return False, node_info

            failed_testnodes = []  # type: List[NodeInfo]
            failed_nodes = []  # type: List[NodeInfo]
            ctx.nodes = []

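            # split connection results: a failed test node aborts the run below,
            # while other unreachable nodes are only excluded from it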
            for ok, node in pool.map(connect_ext, ctx.nodes_info.values()):
                if not ok:
                    node = cast(NodeInfo, node)
                    if 'testnode' in node.roles:
                        failed_testnodes.append(node)
                    else:
                        failed_nodes.append(node)
                else:
                    ctx.nodes.append(cast(IRPCNode, node))

            if failed_nodes:
                msg = "Node(s) {} will be excluded - can't connect"
                logger.warning(msg.format(", ".join(map(str, failed_nodes))))

            if failed_testnodes:
                msg = "Can't start RPC on testnode(s) " + ",".join(map(str, failed_testnodes))
                logger.error(msg)
                raise utils.StopTestError(msg)

            if not failed_nodes:
                logger.info("All nodes connected successfully")

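            # Estimate each node's clock shift: sample the local time before and
            # after asking every node for its time over RPC. A reply from a node
            # with a synchronized clock must fall within [t_start, t_end], so any
            # excess is a lower bound on that node's real clock shift.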
            def get_time(node):
                return node.conn.sys.time()

            t_start = time.time()
            tms = pool.map(get_time, ctx.nodes)
            t_end = time.time()

            for node, val in zip(ctx.nodes, tms):
                max_delta = int(max(t_start - val, val - t_end) * 1000)
                if max_delta > ctx.config.max_time_diff_ms:
                    msg = ("Too large time shift {}ms on node {}. Stopping the test. " +
                           "Fix the time on the cluster nodes and restart the test, or increase " +
                           "the max_time_diff_ms(={}ms) setting in the config").format(max_delta,
                                                                                       str(node),
                                                                                       ctx.config.max_time_diff_ms)
                    logger.error(msg)
                    raise StopTestError(msg)
                if max_delta > 0:
                    logger.warning("Node %s has a time shift of at least %s ms", node, max_delta)

    def cleanup(self, ctx: TestRun) -> None:
        if ctx.config.get("download_rpc_logs", False):
            logger.info("Killing all outstanding processes")
            for node in ctx.nodes:
                node.conn.cli.killall()

            logger.info("Downloading RPC server logs")
            for node in ctx.nodes:
                if node.rpc_log_file is not None:
                    nid = node.node_id
                    path = "rpc_logs/{}.txt".format(nid)
                    node.conn.server.flush_logs()
                    log = node.get_file_content(node.rpc_log_file)
                    if path in ctx.storage:
                        ctx.storage.append_raw(log, path)
                    else:
                        ctx.storage.put_raw(log, path)
                    logger.debug("RPC log from node {} stored into storage::{}".format(nid, path))

        logger.info("Disconnecting")
        with ctx.get_pool() as pool:
            list(pool.map(lambda node: node.disconnect(stop=True), ctx.nodes))


class CollectInfoStage(Stage):
    """Collect node info"""

    priority = StepOrder.START_SENSORS - 2
    config_block = 'collect_info'

    def run(self, ctx: TestRun) -> None:
        if not ctx.config.collect_info:
            return

        futures = {}  # type: Dict[Tuple[str, str], Future]

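        # info already present in storage (e.g. from an interrupted run) is
        # not collected again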
        with ctx.get_pool() as pool:
            # a node's RPC channel can't issue a new request until the previous
            # one finishes, so hw and sw info are gathered in two sequential passes
            for node in ctx.nodes:
                nid = node.node_id
                hw_info_path = "hw_info/{}".format(nid)
                if hw_info_path not in ctx.storage:
                    futures[(hw_info_path, nid)] = pool.submit(hw_info.get_hw_info, node)

            for (path, nid), future in futures.items():
                try:
                    ctx.storage.put(future.result(), path)
                except Exception:
                    logger.exception("Failed to collect hardware info from %s", nid)
                    raise utils.StopTestError()

            futures.clear()
            for node in ctx.nodes:
                nid = node.node_id
                sw_info_path = "sw_info/{}".format(nid)
                if sw_info_path not in ctx.storage:
                    futures[(sw_info_path, nid)] = pool.submit(hw_info.get_sw_info, node)

            for (path, nid), future in futures.items():
                try:
                    ctx.storage.put(future.result(), path)
                except Exception:
                    logger.exception("Failed to collect software info from %s", nid)
                    raise utils.StopTestError()


class ExplicitNodesStage(Stage):
    """Add nodes listed explicitly in the config"""

    priority = StepOrder.DISCOVER
    config_block = 'nodes'

    def run(self, ctx: TestRun) -> None:
        if 'all_nodes' in ctx.storage:
            logger.info("Skip explicit nodes filling, as all_nodes is already in storage")
            return

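        # each 'nodes' config entry maps an SSH URI to a comma-separated role list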
        for url, roles in ctx.config.get('nodes', {}).raw().items():
            ctx.merge_node(ssh_utils.parse_ssh_uri(url), set(role.strip() for role in roles.split(",")))
            logger.debug("Add node %s with roles %s", url, roles)


class SleepStage(Stage):
    """Sleep for the configured number of seconds"""

    priority = StepOrder.TEST
    config_block = 'sleep'

    def run(self, ctx: TestRun) -> None:
        logger.debug("Will sleep for %r seconds", ctx.config.sleep)
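        # record the idle window boundaries so later stages can tell when
        # the cluster was deliberately left without load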
        stime = time.time()
        time.sleep(ctx.config.sleep)
        ctx.storage.put([int(stime), int(time.time())], 'idle')


class PrepareNodes(Stage):
    priority = StepOrder.START_SENSORS - 1

    def __init__(self):
        Stage.__init__(self)
        self.nodeepscrub_updated = False
        self.noscrub_updated = False

    def run(self, ctx: TestRun) -> None:
        ceph_sett = ctx.config.get('ceph_settings', "").split()
        if ceph_sett:
            for node in ctx.nodes:
                if "ceph-mon" in node.info.roles or "ceph-osd" in node.info.roles:
                    state = json.loads(node.run("ceph health --format json"))["summary"]["summary"]
                    if 'noscrub' in ceph_sett:
                        if 'noscrub' in state:
                            logger.debug("noscrub already set on cluster")
                        else:
                            logger.info("Applying noscrub setting to ceph cluster")
                            node.run("ceph osd set noscrub")
                            self.noscrub_updated = True

                    if 'nodeepscrub' in ceph_sett:
                        if 'nodeepscrub' in state:
                            logger.debug("nodeepscrub already set on cluster")
                        else:
                            logger.info("Applying nodeepscrub setting to ceph cluster")
                            node.run("ceph osd set nodeepscrub")
                            self.nodeepscrub_updated = True
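                    # scrub flags are cluster-wide, so setting them from the
                    # first mon/osd node found is enough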
                    break

    def cleanup(self, ctx: TestRun) -> None:
        if self.nodeepscrub_updated or self.noscrub_updated:
            for node in ctx.nodes:
                if "ceph-mon" in node.info.roles or "ceph-osd" in node.info.roles:
                    if self.noscrub_updated:
                        logger.info("Reverting noscrub setting for ceph cluster")
                        node.run("ceph osd unset noscrub")
                        self.noscrub_updated = False

                    if self.nodeepscrub_updated:
                        logger.info("Reverting nodeepscrub setting for ceph cluster")
                        node.run("ceph osd unset nodeepscrub")
                        self.nodeepscrub_updated = False


class RunTestsStage(Stage):

    priority = StepOrder.TEST
    config_block = 'tests'

    def run(self, ctx: TestRun) -> None:
        if ctx.config.no_tests:
            logger.info("Skipping tests, as the 'no_tests' config setting is True")
            return

        for suite_idx, test_suite in enumerate(ctx.config.get('tests', [])):
            test_nodes = [node for node in ctx.nodes if 'testnode' in node.info.roles]

            if not test_nodes:
                logger.error("No test nodes found")
                raise StopTestError()

            if len(test_suite) != 1:
                logger.error("Test suite %s contains more than one test. Put each test in a separate group", suite_idx)
                raise StopTestError()

            name, params = list(test_suite.items())[0]
            vm_count = params.get('node_limit', None)  # type: Optional[int]

            # select test nodes
            if vm_count is None:
                curr_test_nodes = test_nodes
            else:
                curr_test_nodes = test_nodes[:vm_count]

            if not curr_test_nodes:
                logger.error("No nodes found for test, skipping it.")
                continue

            if name not in all_suits:
                logger.error("Test suite %r not found. Only suites [%s] are available", name, ", ".join(all_suits))
                raise StopTestError()

            test_cls = all_suits[name]
            remote_dir = ctx.config.default_test_local_folder.format(name=name, uuid=ctx.config.run_uuid)
            suite = SuiteConfig(test_cls.name,
                                params=params,
                                run_uuid=ctx.config.run_uuid,
                                nodes=curr_test_nodes,
                                remote_dir=remote_dir,
                                idx=suite_idx,
                                keep_raw_files=ctx.config.keep_raw_files)

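            # collect_sensors_data is also passed as the on_idle callback, so
            # the suite can keep sensor data flowing while it is idle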
            test_cls(storage=ResultStorage(ctx.storage),
                     suite=suite,
                     on_idle=lambda: collect_sensors_data(ctx, False)).run()

    @classmethod
    def validate_config(cls, cfg: ConfigBlock) -> None:
        pass


class SaveNodesStage(Stage):
    """Save the node list into storage"""
    nodes_path = 'all_nodes'
    params_path = 'all_nodes_params.js'
    priority = StepOrder.UPDATE_NODES_INFO + 1

    def run(self, ctx: TestRun) -> None:
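        # node params go into a separate JSON file; the serialized node list
        # holds shallow copies with a placeholder instead of the real params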
        infos = list(ctx.nodes_info.values())
        params = {node.node_id: node.params for node in infos}
        ninfos = [copy.copy(node) for node in infos]
        for node in ninfos:
            node.params = "in {!r} file".format(self.params_path)
        ctx.storage.put_list(ninfos, self.nodes_path)
        ctx.storage.put_raw(json.dumps(params).encode('utf8'), self.params_path)


class LoadStoredNodesStage(Stage):
    priority = StepOrder.DISCOVER

    def run(self, ctx: TestRun) -> None:
        if SaveNodesStage.nodes_path in ctx.storage:
            if ctx.nodes_info:
                logger.error("Internal error: some nodes are already in " +
                             "nodes_info before LoadStoredNodesStage")
                raise StopTestError()

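            # restore the node list and re-attach the per-node params that
            # SaveNodesStage stored in its separate JSON file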
            nodes = {node.node_id: node for node in ctx.storage.load_list(NodeInfo, SaveNodesStage.nodes_path)}

            if SaveNodesStage.params_path in ctx.storage:
                params = json.loads(ctx.storage.get_raw(SaveNodesStage.params_path).decode('utf8'))
                for node_id, node in nodes.items():
                    node.params = params.get(node_id, {})

            ctx.nodes_info = nodes
            logger.info("%s nodes loaded from storage", len(ctx.nodes_info))