# Copyright 2017 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import pytest

from devops.helpers import helpers

from tcp_tests import logger
from tcp_tests.utils import get_jenkins_job_stages
from tcp_tests.utils import run_jenkins_job

LOG = logger.logger


class TestFailoverCeph(object):
    """Test class for testing MCP Ceph failover"""

    TEMPEST_JOB_NAME = 'cvp-tempest'
    TEMPEST_JOB_PARAMETERS = {
        'TEMPEST_ENDPOINT_TYPE': 'internalURL',
        'TEMPEST_TEST_PATTERN': 'set=smoke'
    }

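    # Note: the sanity run below excludes test_ceph_health and
    # test_prometheus_alert_count. Presumably these are skipped because
    # cluster health and alert counts can be transiently degraded right
    # after a node reboot (see the PROD-31374 notes in the scenarios below).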
    SANITY_JOB_NAME = 'cvp-sanity'
    SANITY_JOB_PARAMETERS = {
        'EXTRA_PARAMS': {
            'envs': [
                "tests_set=-k "
                "'not test_ceph_health and not test_prometheus_alert_count'"
            ]
        }
    }

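    # Timeouts (in seconds) for the Jenkins builds triggered below: how long
    # to wait for a build to start and how long to wait for it to finish.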
    JENKINS_START_TIMEOUT = 60
    JENKINS_BUILD_TIMEOUT = 60 * 15

    def get_ceph_health(self, ssh, node_names):
        """Get Ceph health status on specified nodes

        :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
        :param node_names: list, full hostnames of the Ceph nodes to check
        :return: dict, Ceph health status from each node (output of the
            'ceph -s' command executed on each node)
        """
        return {
            node_name: ssh.check_call(
                "ceph -s",
                node_name=node_name,
                raise_on_err=False)['stdout_str']
            for node_name in node_names
        }

    def run_jenkins_job(
            self, creds, name, parameters, start_timeout, build_timeout):
        """Execute a Jenkins job with provided parameters

        :param creds: dict, Jenkins url and user credentials
        :param name: string, Jenkins job to execute
        :param parameters: dict, parameters for Jenkins job
        :param start_timeout: int, timeout to wait until build is started
        :param build_timeout: int, timeout to wait until build is finished
        :return: tuple, Jenkins job build execution status, high level
            description of the build and verbose description of executed
            job stages
        """
        jenkins_url, jenkins_user, jenkins_pass = (
            creds['url'], creds['user'], creds['pass'])
        build_status = run_jenkins_job.run_job(
            host=jenkins_url,
            username=jenkins_user,
            password=jenkins_pass,
            start_timeout=start_timeout,
            build_timeout=build_timeout,
            verbose=False,
            job_name=name,
            job_parameters=parameters)

        description, stages = get_jenkins_job_stages.get_deployment_result(
            host=jenkins_url,
            username=jenkins_user,
            password=jenkins_pass,
            job_name=name,
            build_number='lastBuild')

        return build_status, description, stages

    @pytest.mark.grab_versions
    @pytest.mark.restart_osd_node
    def test_restart_osd_node(
            self,
            salt_actions,
            underlay_actions,
            show_step):
        """Verify that Ceph OSD node is not affected by system restart

        Scenario:
            1. Find Ceph OSD nodes
            2. Check Ceph cluster health before node restart (skipped until
               PROD-31374 is fixed)
            3. Restart 1 Ceph OSD node
            4. Check Ceph cluster health after node restart (skipped until
               PROD-31374 is fixed)
            5. Run Tempest smoke test suite
            6. Run test_ceph_status.py::test_ceph_osd and
               test_services.py::test_check_services[osd] sanity tests

        Duration: ~9 min
        """
        salt = salt_actions
        ssh = underlay_actions

        # Find Ceph OSD nodes
        show_step(1)
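        # "I@ceph:osd" is a Salt compound/pillar match: it selects every
        # minion whose pillar data contains the ceph:osd key, i.e. all
        # Ceph OSD nodes of the cluster.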
        tgt = "I@ceph:osd"
        osd_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
        # Select a node for the test
        osd_host = osd_hosts[0]

        # Check Ceph cluster health before node restart
        show_step(2)
        ceph_health = self.get_ceph_health(ssh, osd_hosts)
        # FIXME: uncomment the check once PROD-31374 is fixed
        # status = all(
        #     ["OK" in status for node, status in ceph_health.items()])
        # assert status, "Ceph health is not OK: {0}".format(ceph_health)

        # Restart a Ceph OSD node
        show_step(3)
        LOG.info("Sending reboot command to '{}' node.".format(osd_host))
        remote = ssh.remote(node_name=osd_host)
        remote.execute_async("/sbin/shutdown -r now")

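        # The reboot is issued asynchronously because the SSH session dies
        # together with the node; reconnect is then polled until SSH is back,
        # and a Salt test.echo call confirms that the minion is responsive
        # again, not just the sshd service.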
        # Wait for the restarted node to boot and become accessible
        helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
        echo_request = "echo"
        echo_response = salt.local(
            osd_host, "test.echo", echo_request)['return'][0]
        assert echo_request == echo_response[osd_host], (
            "Minion on node '{}' is not responding after node "
            "reboot.".format(osd_host)
        )
        LOG.info("'{}' node is back after reboot.".format(osd_host))

        # Check Ceph cluster health after node restart
        show_step(4)
        ceph_health = self.get_ceph_health(ssh, osd_hosts)  # noqa
        # FIXME: uncomment the check once PROD-31374 is fixed
        # status = all(
        #     ["OK" in status for node, status in ceph_health.items()])
        # assert status, "Ceph health is not OK: {0}".format(ceph_health)

        # Run Tempest smoke test suite
        show_step(5)
        jenkins_creds = salt.get_cluster_jenkins_creds()
        status, description, stages = self.run_jenkins_job(
            jenkins_creds,
            self.TEMPEST_JOB_NAME,
            self.TEMPEST_JOB_PARAMETERS,
            self.JENKINS_START_TIMEOUT,
            self.JENKINS_BUILD_TIMEOUT
        )
        assert status == 'SUCCESS', (
            "'{0}' job run status is {1} after executing Tempest smoke "
            "tests. Please check the build:\n{2}\n\nExecuted build "
            "stages:\n{3}".format(
                self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
        )

        # Run Sanity test
        show_step(6)
        status, description, stages = self.run_jenkins_job(
            jenkins_creds,
            self.SANITY_JOB_NAME,
            self.SANITY_JOB_PARAMETERS,
            self.JENKINS_START_TIMEOUT,
            self.JENKINS_BUILD_TIMEOUT
        )
        assert status == 'SUCCESS', (
            "'{0}' job run status is {1} after executing selected sanity "
            "tests. Please check the build:\n{2}\n\nExecuted build "
            "stages:\n{3}".format(
                self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
        )

    @pytest.mark.grab_versions
    @pytest.mark.restart_cmn_node
    def test_restart_cmn_node(
            self,
            salt_actions,
            underlay_actions,
            show_step):
        """Verify that Ceph CMN node is not affected by system restart

        Scenario:
            1. Find Ceph CMN nodes
            2. Check Ceph cluster health before node restart (skipped until
               PROD-31374 is fixed)
            3. Restart 1 Ceph CMN node
            4. Check Ceph cluster health after node restart (skipped until
               PROD-31374 is fixed)
            5. Run Tempest smoke test suite
            6. Run test_ceph_status.py::test_ceph_replicas and
               test_services.py::test_check_services[cmn] sanity tests

        Duration: ~9 min
        """
        salt = salt_actions
        ssh = underlay_actions

        # Find Ceph CMN nodes
        show_step(1)
        tgt = "I@ceph:mon"
        cmn_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
        # Select a node for the test
        cmn_host = cmn_hosts[0]

        # Check Ceph cluster health before node restart
        show_step(2)
        ceph_health = self.get_ceph_health(ssh, cmn_hosts)
        # FIXME: uncomment the check once PROD-31374 is fixed
        # status = all(
        #     ["OK" in status for node, status in ceph_health.items()])
        # assert status, "Ceph health is not OK: {0}".format(ceph_health)

        # Restart a Ceph CMN node
        show_step(3)
        LOG.info("Sending reboot command to '{}' node.".format(cmn_host))
        remote = ssh.remote(node_name=cmn_host)
        remote.execute_async("/sbin/shutdown -r now")

        # Wait for the restarted node to boot and become accessible
        helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
        echo_request = "echo"
        echo_response = salt.local(
            cmn_host, "test.echo", echo_request)['return'][0]
        assert echo_request == echo_response[cmn_host], (
            "Minion on node '{}' is not responding after node "
            "reboot.".format(cmn_host)
        )
        LOG.info("'{}' node is back after reboot.".format(cmn_host))

        # Check Ceph cluster health after node restart
        show_step(4)
        ceph_health = self.get_ceph_health(ssh, cmn_hosts)  # noqa
        # FIXME: uncomment the check once PROD-31374 is fixed
        # status = all(
        #     ["OK" in status for node, status in ceph_health.items()])
        # assert status, "Ceph health is not OK: {0}".format(ceph_health)

        # Run Tempest smoke test suite
        show_step(5)
        jenkins_creds = salt.get_cluster_jenkins_creds()
        status, description, stages = self.run_jenkins_job(
            jenkins_creds,
            self.TEMPEST_JOB_NAME,
            self.TEMPEST_JOB_PARAMETERS,
            self.JENKINS_START_TIMEOUT,
            self.JENKINS_BUILD_TIMEOUT
        )
        assert status == 'SUCCESS', (
            "'{0}' job run status is {1} after executing Tempest smoke "
            "tests. Please check the build:\n{2}\n\nExecuted build "
            "stages:\n{3}".format(
                self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
        )

        # Run Sanity test
        show_step(6)
        status, description, stages = self.run_jenkins_job(
            jenkins_creds,
            self.SANITY_JOB_NAME,
            self.SANITY_JOB_PARAMETERS,
            self.JENKINS_START_TIMEOUT,
            self.JENKINS_BUILD_TIMEOUT
        )
        assert status == 'SUCCESS', (
            "'{0}' job run status is {1} after executing selected sanity "
            "tests. Please check the build:\n{2}\n\nExecuted build "
            "stages:\n{3}".format(
                self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
        )

    @pytest.mark.grab_versions
    @pytest.mark.restart_rgw_node
    def test_restart_rgw_node(
            self,
            salt_actions,
            underlay_actions,
            show_step):
        """Verify that Ceph RGW node is not affected by system restart

        Scenario:
            1. Find Ceph RGW nodes
            2. Check Ceph cluster health before node restart (skipped until
               PROD-31374 is fixed)
            3. Restart 1 Ceph RGW node
            4. Check Ceph cluster health after node restart (skipped until
               PROD-31374 is fixed)
            5. Run Tempest smoke test suite
            6. Run test_services.py::test_check_services[rgw] sanity test

        Duration: ~9 min
        """
        salt = salt_actions
        ssh = underlay_actions

        # Find Ceph RGW nodes
        show_step(1)
        tgt = "I@ceph:radosgw"
        rgw_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
        # Select a node for the test
        rgw_host = rgw_hosts[0]

        # Check Ceph cluster health before node restart
        show_step(2)
        ceph_health = self.get_ceph_health(ssh, rgw_hosts)
        # FIXME: uncomment the check once PROD-31374 is fixed
        # status = all(
        #     ["OK" in status for node, status in ceph_health.items()])
        # assert status, "Ceph health is not OK: {0}".format(ceph_health)

        # Restart a Ceph RGW node
        show_step(3)
        LOG.info("Sending reboot command to '{}' node.".format(rgw_host))
        remote = ssh.remote(node_name=rgw_host)
        remote.execute_async("/sbin/shutdown -r now")

        # Wait for the restarted node to boot and become accessible
        helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
        echo_request = "echo"
        echo_response = salt.local(
            rgw_host, "test.echo", echo_request)['return'][0]
        assert echo_request == echo_response[rgw_host], (
            "Minion on node '{}' is not responding after node "
            "reboot.".format(rgw_host)
        )
        LOG.info("'{}' node is back after reboot.".format(rgw_host))

        # Check Ceph cluster health after node restart
        show_step(4)
        ceph_health = self.get_ceph_health(ssh, rgw_hosts)  # noqa
        # FIXME: uncomment the check once PROD-31374 is fixed
        # status = all(
        #     ["OK" in status for node, status in ceph_health.items()])
        # assert status, "Ceph health is not OK: {0}".format(ceph_health)

        # Run Tempest smoke test suite
        show_step(5)
        jenkins_creds = salt.get_cluster_jenkins_creds()
        status, description, stages = self.run_jenkins_job(
            jenkins_creds,
            self.TEMPEST_JOB_NAME,
            self.TEMPEST_JOB_PARAMETERS,
            self.JENKINS_START_TIMEOUT,
            self.JENKINS_BUILD_TIMEOUT
        )
        assert status == 'SUCCESS', (
            "'{0}' job run status is {1} after executing Tempest smoke "
            "tests. Please check the build:\n{2}\n\nExecuted build "
            "stages:\n{3}".format(
                self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
        )

        # Run Sanity test
        show_step(6)
        status, description, stages = self.run_jenkins_job(
            jenkins_creds,
            self.SANITY_JOB_NAME,
            self.SANITY_JOB_PARAMETERS,
            self.JENKINS_START_TIMEOUT,
            self.JENKINS_BUILD_TIMEOUT
        )
        assert status == 'SUCCESS', (
            "'{0}' job run status is {1} after executing selected sanity "
            "tests. Please check the build:\n{2}\n\nExecuted build "
            "stages:\n{3}".format(
                self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
        )

    # #######################################################################
    # ############# Tests for fuel-devops deployed environments #############
    # #######################################################################
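    # Note: the methods below are prefixed with an underscore, so they do not
    # match pytest's default 'test*' collection pattern and are effectively
    # disabled; they are kept for reference for fuel-devops environments.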
    def show_failed_msg(self, failed):
        return "There are failed tempest tests:\n\n {0}".format(
            '\n\n '.join([(name + ': ' + detail)
                          for name, detail in failed.items()]))

    @pytest.mark.grab_versions
    @pytest.mark.fail_snapshot
    def _test_restart_osd_node(self, func_name, underlay, config,
                               openstack_deployed, ceph_deployed,
                               openstack_actions, hardware,
                               rally, show_step):
        """Test restart ceph osd node

        Scenario:
            1. Find ceph osd nodes
            2. Check ceph health before restart
            3. Restart 1 ceph osd node
            4. Check ceph health after restart
            5. Run tempest smoke after failover
            6. Check tempest report for failed tests

        Requirements:
            - Salt cluster
            - OpenStack cluster
            - Ceph cluster
        """
        openstack_actions._salt.local(
            tgt='*', fun='cmd.run',
            args='service ntp stop; ntpd -gq; service ntp start')
        # STEP #1
        show_step(1)
        osd_node_names = underlay.get_target_node_names(
            target='osd')

        # STEP #2
        show_step(2)
        # Get the ceph health output before restart
        health_before = self.get_ceph_health(underlay, osd_node_names)
        assert all(["OK" in p for n, p in health_before.items()]), (
            "Ceph health is not OK from node: {0}".format(health_before))

        # STEP #3
        show_step(3)
        hardware.warm_restart_nodes(underlay, 'osd01')

        openstack_actions._salt.local(
            tgt='*', fun='cmd.run',
            args='service ntp stop; ntpd -gq; service ntp start')

        # STEP #4
        show_step(4)
        # Get the ceph health output after restart
        health_after = self.get_ceph_health(underlay, osd_node_names)
        assert all(["OK" in p for n, p in health_after.items()]), (
            "Ceph health is not OK from node: {0}".format(health_after))

        rally.run_container()

        # STEP #5
        show_step(5)
        results = rally.run_tempest(pattern='set=smoke',
                                    conf_name='/var/lib/ceph_mcp.conf',
                                    report_prefix=func_name,
                                    designate_plugin=False,
                                    timeout=1800)
        # Step #6
        show_step(6)
        assert not results['fail'], self.show_failed_msg(results['fail'])

        LOG.info("*************** DONE **************")

    @pytest.mark.grab_versions
    @pytest.mark.fail_snapshot
    def _test_restart_cmn_node(self, func_name, underlay, config,
                               openstack_deployed, ceph_deployed,
                               core_actions,
                               salt_actions, openstack_actions,
                               rally, show_step, hardware):
        """Test restart ceph cmn node

        Scenario:
            1. Find ceph cmn nodes
            2. Check ceph health before restart
            3. Restart 1 ceph cmn node
            4. Check ceph health after restart
            5. Run tempest smoke after failover
            6. Check tempest report for failed tests

        Requirements:
            - Salt cluster
            - OpenStack cluster
            - Ceph cluster
        """
        openstack_actions._salt.local(
            tgt='*', fun='cmd.run',
            args='service ntp stop; ntpd -gq; service ntp start')
        # STEP #1
        show_step(1)
        cmn_node_names = underlay.get_target_node_names(
            target='cmn')

        # STEP #2
        show_step(2)
        # Get the ceph health output before restart
        health_before = self.get_ceph_health(underlay, cmn_node_names)
        assert all(["OK" in p for n, p in health_before.items()]), (
            "Ceph health is not OK from node: {0}".format(health_before))

        # STEP #3
        show_step(3)
        hardware.warm_restart_nodes(underlay, 'cmn01')

        openstack_actions._salt.local(
            tgt='*', fun='cmd.run',
            args='service ntp stop; ntpd -gq; service ntp start')

        # STEP #4
        show_step(4)
        # Get the ceph health output after restart
        health_after = self.get_ceph_health(underlay, cmn_node_names)
        assert all(["OK" in p for n, p in health_after.items()]), (
            "Ceph health is not OK from node: {0}".format(health_after))

        rally.run_container()

        # STEP #5
        show_step(5)
        results = rally.run_tempest(pattern='set=smoke',
                                    conf_name='/var/lib/ceph_mcp.conf',
                                    report_prefix=func_name,
                                    designate_plugin=False,
                                    timeout=1800)
        # Step #6
        show_step(6)
        assert not results['fail'], self.show_failed_msg(results['fail'])

        LOG.info("*************** DONE **************")

    @pytest.mark.grab_versions
    @pytest.mark.fail_snapshot
    def _test_restart_rgw_node(self, func_name, underlay, config,
                               openstack_deployed, ceph_deployed,
                               core_actions, hardware,
                               salt_actions, openstack_actions,
                               rally, show_step):
        """Test restart ceph rgw node

        Scenario:
            1. Find ceph rgw nodes
            2. Check ceph health before restart
            3. Restart 1 ceph rgw node
            4. Check ceph health after restart
            5. Run tempest smoke after failover
            6. Check tempest report for failed tests

        Requirements:
            - Salt cluster
            - OpenStack cluster
            - Ceph cluster
        """
        openstack_actions._salt.local(
            tgt='*', fun='cmd.run',
            args='service ntp stop; ntpd -gq; service ntp start')

        # STEP #1
        show_step(1)
        rgw_node_names = underlay.get_target_node_names(
            target='rgw')
        if not rgw_node_names:
            pytest.skip('Skip as there are no rgw nodes in the deployment')

        # STEP #2
        show_step(2)
        # Get the ceph health output before restart
        health_before = self.get_ceph_health(underlay, rgw_node_names)
        assert all(["OK" in p for n, p in health_before.items()]), (
            "Ceph health is not OK from node: {0}".format(health_before))

        # STEP #3
        show_step(3)
        hardware.warm_restart_nodes(underlay, 'rgw01')

        openstack_actions._salt.local(
            tgt='*', fun='cmd.run',
            args='service ntp stop; ntpd -gq; service ntp start')

        # STEP #4
        show_step(4)
        # Get the ceph health output after restart
        health_after = self.get_ceph_health(underlay, rgw_node_names)
        assert all(["OK" in p for n, p in health_after.items()]), (
            "Ceph health is not OK from node: {0}".format(health_after))

        rally.run_container()

        # STEP #5
        show_step(5)
        results = rally.run_tempest(pattern='set=smoke',
                                    conf_name='/var/lib/ceph_mcp.conf',
                                    designate_plugin=False,
                                    report_prefix=func_name,
                                    timeout=1800)
        # Step #6
        show_step(6)
        assert not results['fail'], self.show_failed_msg(results['fail'])

        LOG.info("*************** DONE **************")