tcp_tests/tests/system/test_failover_ceph.py - mcp/tcp-qa - Gitiles

 #    Copyright 2017 Mirantis, Inc.
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
 #    not use this file except in compliance with the License. You may obtain
 #    a copy of the License at
 #
 #         http://www.apache.org/licenses/LICENSE-2.0
 #
 #    Unless required by applicable law or agreed to in writing, software
 #    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.
 import pytest

 from devops.helpers import helpers
 from tcp_tests import logger

 LOG = logger.logger


 class TestFailoverCeph(object):
     """Test class for testing MCP Ceph failover"""

     TEMPEST_JOB_NAME = 'cvp-tempest'
     TEMPEST_JOB_PARAMETERS = {
             'TEMPEST_ENDPOINT_TYPE': 'internalURL',
             'TEMPEST_TEST_PATTERN': 'set=smoke'
     }

     SANITY_JOB_NAME = 'cvp-sanity'
     SANITY_JOB_PARAMETERS = {
         'EXTRA_PARAMS': {
             'envs': [
                 "tests_set=-k "
                 "'not test_ceph_health and not test_prometheus_alert_count'"
             ]
         }
     }

     JENKINS_START_TIMEOUT = 60
     JENKINS_BUILD_TIMEOUT = 60 * 15

     def get_ceph_health(self, ssh, node_names):
         """Get Ceph health status on specified nodes

         :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
         :param node_names: list, full hostnames of Ceph OSD nodes
         :return: dict, Ceph health status from each OSD node (output of
             'ceph -s' command executed on each node)
         """
         return {
             node_name: ssh.check_call(
                 "ceph -s",
                 node_name=node_name,
                 raise_on_err=False)['stdout_str']
             for node_name in node_names
         }

     @pytest.mark.grab_versions
     @pytest.mark.restart_osd_node
     def test_restart_osd_node(
             self,
             salt_actions,
             drivetrain_actions,
             underlay_actions,
             show_step):
         """Verify that Ceph OSD node is not affected by system restart

         Scenario:
         1. Find Ceph OSD nodes
         2. Check Ceph cluster health before node restart (skipped until
             PROD-31374 is fixed)
         3. Restart 1 Ceph OSD node
         4. Check Ceph cluster health after node restart (skipped until
             PROD-31374 is fixed)
         5. Run Tempest smoke test suite
         6. Run test_ceph_status.py::test_ceph_osd and
             test_services.py::test_check_services[osd] sanity tests

         Duration: ~9 min
         """
         salt = salt_actions
         ssh = underlay_actions
         dt = drivetrain_actions

         # Find Ceph OSD nodes
         show_step(1)
         tgt = "I@ceph:osd"
         osd_hosts = salt.local(tgt, "test.ping")['return'][0].keys()
         # Select a node for the test
         osd_host = osd_hosts[0]

         # Check Ceph cluster health before node restart
         show_step(2)
         ceph_health = self.get_ceph_health(ssh, osd_hosts)
         # FIXME: uncomment the check once PROD-31374 is fixed
         # status = all(
         #     ["OK" in status for node, status in ceph_health.items()])
         # assert status, "Ceph health is not OK: {0}".format(ceph_health)

         # Restart a Ceph OSD node
         show_step(3)
         LOG.info("Sending reboot command to '{}' node.".format(osd_host))
         remote = ssh.remote(node_name=osd_host)
         remote.execute_async("/sbin/shutdown -r now")

         # Wait for restarted node to boot and become accessible
         helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
         echo_request = "echo"
         echo_response = salt.local(
             osd_host, "test.echo", echo_request)['return'][0]
         assert echo_request == echo_response[osd_host], (
             "Minion on node '{}' node is not responding after node "
             "reboot.".format(osd_host)
         )
         LOG.info("'{}' node is back after reboot.".format(osd_host))

         # Check Ceph cluster health after node restart
         show_step(4)
         ceph_health = self.get_ceph_health(ssh, osd_hosts)  # noqa
         # FIXME: uncomment the check once PROD-31374 is fixed
         # status = all(
         #     ["OK" in status for node, status in ceph_health.items()])
         # assert status, "Ceph health is not OK: {0}".format(ceph_health)

         # Run Tempest smoke test suite
         show_step(5)
         status = dt.start_job_on_cid_jenkins(
             job_name=self.TEMPEST_JOB_NAME,
             job_parameters=self.TEMPEST_JOB_PARAMETERS,
             start_timeout=self.JENKINS_START_TIMEOUT,
             build_timeout=self.JENKINS_BUILD_TIMEOUT
         )

         assert status == 'SUCCESS', (
             "'{0}' job run status is {1} after executing Tempest smoke "
             "tests".format(
                 self.TEMPEST_JOB_NAME, status)
         )

         # Run Sanity test
         show_step(6)
         status = dt.start_job_on_cid_jenkins(
             job_name=self.SANITY_JOB_NAME,
             job_parameters=self.SANITY_JOB_PARAMETERS,
             start_timeout=self.JENKINS_START_TIMEOUT,
             build_timeout=self.JENKINS_BUILD_TIMEOUT
         )
         assert status == 'SUCCESS', (
             "'{0}' job run status is {1} after executing selected sanity "
             "tests".format(
                 self.SANITY_JOB_NAME, status)
         )

     @pytest.mark.grab_versions
     @pytest.mark.restart_cmn_node
     def test_restart_cmn_node(
             self,
             salt_actions,
             drivetrain_actions,
             underlay_actions,
             show_step):
         """Verify that Ceph CMN node is not affected by system restart

         Scenario:
         1. Find Ceph CMN nodes
         2. Check Ceph cluster health before node restart (skipped until
             PROD-31374 is fixed)
         3. Restart 1 Ceph CMN node
         4. Check Ceph cluster health after node restart (skipped until
             PROD-31374 is fixed)
         5. Run Tempest smoke test suite
         6. Run test_ceph_status.py::test_ceph_replicas and
             test_services.py::test_check_services[cmn] sanity tests

         Duration: ~9 min
         """
         salt = salt_actions
         ssh = underlay_actions
         dt = drivetrain_actions

         # Find Ceph CMN nodes
         show_step(1)
         tgt = "I@ceph:mon"
         cmn_hosts = salt.local(tgt, "test.ping")['return'][0].keys()
         # Select a node for the test
         cmn_host = cmn_hosts[0]

         # Check Ceph cluster health before node restart
         show_step(2)
         ceph_health = self.get_ceph_health(ssh, cmn_hosts)
         # FIXME: uncomment the check once PROD-31374 is fixed
         # status = all(
         #     ["OK" in status for node, status in ceph_health.items()])
         # assert status, "Ceph health is not OK: {0}".format(ceph_health)

         # Restart a Ceph CMN node
         show_step(3)
         LOG.info("Sending reboot command to '{}' node.".format(cmn_host))
         remote = ssh.remote(node_name=cmn_host)
         remote.execute_async("/sbin/shutdown -r now")

         # Wait for restarted node to boot and become accessible
         helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
         echo_request = "echo"
         echo_response = salt.local(
             cmn_host, "test.echo", echo_request)['return'][0]
         assert echo_request == echo_response[cmn_host], (
             "Minion on node '{}' node is not responding after node "
             "reboot.".format(cmn_host)
         )
         LOG.info("'{}' node is back after reboot.".format(cmn_host))

         # Check Ceph cluster health after node restart
         show_step(4)
         ceph_health = self.get_ceph_health(ssh, cmn_hosts) # noqa
         # FIXME: uncomment the check once PROD-31374 is fixed
         # status = all(
         #     ["OK" in status for node, status in ceph_health.items()])
         # assert status, "Ceph health is not OK: {0}".format(ceph_health)

         # Run Tempest smoke test suite
         show_step(5)
         status = dt.start_job_on_cid_jenkins(
             job_name=self.TEMPEST_JOB_NAME,
             job_parameters=self.TEMPEST_JOB_PARAMETERS,
             start_timeout=self.JENKINS_START_TIMEOUT,
             build_timeout=self.JENKINS_BUILD_TIMEOUT
         )

         assert status == 'SUCCESS', (
             "'{0}' job run status is {1} after executing Tempest smoke "
             "tests".format(
                 self.TEMPEST_JOB_NAME, status)
         )

         # Run Sanity test
         show_step(6)
         status = dt.start_job_on_cid_jenkins(
             job_name=self.SANITY_JOB_NAME,
             job_parameters=self.SANITY_JOB_PARAMETERS,
             start_timeout=self.JENKINS_START_TIMEOUT,
             build_timeout=self.JENKINS_BUILD_TIMEOUT
         )
         assert status == 'SUCCESS', (
             "'{0}' job run status is {1} after executing selected sanity "
             "tests".format(
                 self.SANITY_JOB_NAME, status)
         )

     @pytest.mark.grab_versions
     @pytest.mark.restart_rgw_node
     def test_restart_rgw_node(
             self,
             salt_actions,
             drivetrain_actions,
             underlay_actions,
             show_step):
         """Verify that Ceph RGW node is not affected by system restart

         Scenario:
         1. Find Ceph RGW nodes
         2. Check Ceph cluster health before node restart (skipped until
             PROD-31374 is fixed)
         3. Restart 1 Ceph RGW node
         4. Check Ceph cluster health after node restart (skipped until
             PROD-31374 is fixed)
         5. Run Tempest smoke test suite
         6. Run test_services.py::test_check_services[rgw] sanity test

         Duration: ~9 min
         """
         salt = salt_actions
         ssh = underlay_actions
         dt = drivetrain_actions

         # Find Ceph RGW nodes
         show_step(1)
         tgt = "I@ceph:radosgw"
         rgw_hosts = salt.local(tgt, "test.ping")['return'][0].keys()
         # Select a node for the test
         rgw_host = rgw_hosts[0]

         # Check Ceph cluster health before node restart
         show_step(2)
         ceph_health = self.get_ceph_health(ssh, rgw_hosts)
         # FIXME: uncomment the check once PROD-31374 is fixed
         # status = all(
         #     ["OK" in status for node, status in ceph_health.items()])
         # assert status, "Ceph health is not OK: {0}".format(ceph_health)

         # Restart a Ceph RGW node
         show_step(3)
         LOG.info("Sending reboot command to '{}' node.".format(rgw_host))
         remote = ssh.remote(node_name=rgw_host)
         remote.execute_async("/sbin/shutdown -r now")

         # Wait for restarted node to boot and become accessible
         helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
         echo_request = "echo"
         echo_response = salt.local(
             rgw_host, "test.echo", echo_request)['return'][0]
         assert echo_request == echo_response[rgw_host], (
             "Minion on node '{}' node is not responding after node "
             "reboot.".format(rgw_host)
         )
         LOG.info("'{}' node is back after reboot.".format(rgw_host))

         # Check Ceph cluster health after node restart
         show_step(4)
         ceph_health = self.get_ceph_health(ssh, rgw_hosts) # noqa
         # FIXME: uncomment the check once PROD-31374 is fixed
         # status = all(
         #     ["OK" in status for node, status in ceph_health.items()])
         # assert status, "Ceph health is not OK: {0}".format(ceph_health)

         # Run Tempest smoke test suite
         show_step(5)
         status = dt.start_job_on_cid_jenkins(
             job_name=self.TEMPEST_JOB_NAME,
             job_parameters=self.TEMPEST_JOB_PARAMETERS,
             start_timeout=self.JENKINS_START_TIMEOUT,
             build_timeout=self.JENKINS_BUILD_TIMEOUT
         )

         assert status == 'SUCCESS', (
             "'{0}' job run status is {1} after executing Tempest smoke "
             "tests".format(
                 self.TEMPEST_JOB_NAME, status)
         )

         # Run Sanity test
         show_step(6)
         status = dt.start_job_on_cid_jenkins(
             job_name=self.SANITY_JOB_NAME,
             job_parameters=self.SANITY_JOB_PARAMETERS,
             start_timeout=self.JENKINS_START_TIMEOUT,
             build_timeout=self.JENKINS_BUILD_TIMEOUT
         )
         assert status == 'SUCCESS', (
             "'{0}' job run status is {1} after executing selected sanity "
             "tests".format(
                 self.SANITY_JOB_NAME, status)
         )

     # #######################################################################
     # ############# Tests for fuel-devops deployed environments #############
     # #######################################################################
     def show_failed_msg(self, failed):
         return "There are failed tempest tests:\n\n  {0}".format(
             '\n\n  '.join([(name + ': ' + detail)
                            for name, detail in failed.items()]))

     @pytest.mark.grab_versions
     @pytest.mark.fail_snapshot
     def _test_restart_osd_node(self, func_name, underlay, config,
                                openstack_deployed, ceph_deployed,
                                openstack_actions, hardware,
                                rally, show_step):
         """Test restart ceph osd node

         Scenario:
             1. Find ceph osd nodes
             2. Check ceph health before restart
             3. Restart 1 ceph osd node
             4. Check ceph health after restart
             5. Run tempest smoke after failover
             6. Check tempest report for failed tests

         Requiremets:
             - Salt cluster
             - OpenStack cluster
             - Ceph cluster
         """
         openstack_actions._salt.local(
             tgt='*', fun='cmd.run',
             args='service ntp stop; ntpd -gq; service ntp start')
         # STEP #1
         show_step(1)
         osd_node_names = underlay.get_target_node_names(
             target='osd')

         # STEP #2
         show_step(2)
         # Get the ceph health output before restart
         health_before = self.get_ceph_health(underlay, osd_node_names)
         assert all(["OK" in p for n, p in health_before.items()]), (
             "'Ceph health is not ok from node: {0}".format(health_before))

         # STEP #3
         show_step(3)
         hardware.warm_restart_nodes(underlay, 'osd01')

         openstack_actions._salt.local(
             tgt='*', fun='cmd.run',
             args='service ntp stop; ntpd -gq; service ntp start')

         # STEP #4
         show_step(4)
         # Get the ceph health output after restart
         health_after = self.get_ceph_health(underlay, osd_node_names)
         assert all(["OK" in p for n, p in health_before.items()]), (
             "'Ceph health is not ok from node: {0}".format(health_after))

         rally.run_container()

         # STEP #5
         show_step(5)
         results = rally.run_tempest(pattern='set=smoke',
                                     conf_name='/var/lib/ceph_mcp.conf',
                                     report_prefix=func_name,
                                     designate_plugin=False,
                                     timeout=1800)
         # Step #6
         show_step(6)
         assert not results['fail'], self.show_failed_msg(results['fail'])

         LOG.info("*************** DONE **************")

     @pytest.mark.grab_versions
     @pytest.mark.fail_snapshot
     def _test_restart_cmn_node(self, func_name, underlay, config,
                                openstack_deployed, ceph_deployed,
                                core_actions,
                                salt_actions, openstack_actions,
                                rally, show_step, hardware):
         """Test restart ceph cmn node

         Scenario:
             1. Find ceph cmn nodes
             2. Check ceph health before restart
             3. Restart 1 ceph cmn node
             4. Check ceph health after restart
             5. Run tempest smoke after failover
             6. Check tempest report for failed tests

         Requiremets:
             - Salt cluster
             - OpenStack cluster
             - Ceph cluster
         """
         openstack_actions._salt.local(
             tgt='*', fun='cmd.run',
             args='service ntp stop; ntpd -gq; service ntp start')
         # STEP #1
         show_step(1)
         cmn_node_names = underlay.get_target_node_names(
             target='cmn')

         # STEP #2
         show_step(2)
         # Get the ceph health output before restart
         health_before = self.get_ceph_health(underlay, cmn_node_names)
         assert all(["OK" in p for n, p in health_before.items()]), (
             "'Ceph health is not ok from node: {0}".format(health_before))

         # STEP #3
         show_step(3)
         hardware.warm_restart_nodes(underlay, 'cmn01')

         openstack_actions._salt.local(
             tgt='*', fun='cmd.run',
             args='service ntp stop; ntpd -gq; service ntp start')

         # STEP #4
         show_step(4)
         # Get the ceph health output after restart
         health_after = self.get_ceph_health(underlay, cmn_node_names)
         assert all(["OK" in p for n, p in health_before.items()]), (
             "'Ceph health is not ok from node: {0}".format(health_after))

         rally.run_container()

         # STEP #5
         show_step(5)
         results = rally.run_tempest(pattern='set=smoke',
                                     conf_name='/var/lib/ceph_mcp.conf',
                                     report_prefix=func_name,
                                     designate_plugin=False,
                                     timeout=1800)
         # Step #6
         show_step(6)
         assert not results['fail'], self.show_failed_msg(results['fail'])

         LOG.info("*************** DONE **************")

     @pytest.mark.grab_versions
     @pytest.mark.fail_snapshot
     def _test_restart_rgw_node(self, func_name, underlay, config,
                                openstack_deployed, ceph_deployed,
                                core_actions, hardware,
                                salt_actions, openstack_actions,
                                rally, show_step):
         """Test restart ceph rgw node

         Scenario:
             1. Find ceph rgw nodes
             2. Check ceph health before restart
             3. Restart 1 ceph rgw node
             4. Check ceph health after restart
             5. Run tempest smoke after failover
             6. Check tempest report for failed tests

         Requiremets:
             - Salt cluster
             - OpenStack cluster
             - Ceph cluster
         """
         openstack_actions._salt.local(
             tgt='*', fun='cmd.run',
             args='service ntp stop; ntpd -gq; service ntp start')

         # STEP #1
         show_step(1)
         rgw_node_names = underlay.get_target_node_names(
             target='rgw')
         if not rgw_node_names:
             pytest.skip('Skip as there are not rgw nodes in deploy')

         # STEP #2
         show_step(2)
         # Get the ceph health output before restart
         health_before = self.get_ceph_health(underlay, rgw_node_names)
         assert all(["OK" in p for n, p in health_before.items()]), (
             "'Ceph health is not ok from node: {0}".format(health_before))

         # STEP #3
         show_step(3)
         hardware.warm_restart_nodes(underlay, 'rgw01')

         openstack_actions._salt.local(
             tgt='*', fun='cmd.run',
             args='service ntp stop; ntpd -gq; service ntp start')

         # STEP #4
         show_step(4)
         # Get the ceph health output after restart
         health_after = self.get_ceph_health(underlay, rgw_node_names)
         assert all(["OK" in p for n, p in health_before.items()]), (
             "'Ceph health is not ok from node: {0}".format(health_after))

         rally.run_container()

         # STEP #5
         show_step(5)
         results = rally.run_tempest(pattern='set=smoke',
                                     conf_name='/var/lib/ceph_mcp.conf',
                                     designate_plugin=False,
                                     report_prefix=func_name,
                                     timeout=1800)
         # Step #6
         show_step(6)
         assert not results['fail'], self.show_failed_msg(results['fail'])

         LOG.info("*************** DONE **************")
	# Copyright 2017 Mirantis, Inc.
	#
	# Licensed under the Apache License, Version 2.0 (the "License"); you may
	# not use this file except in compliance with the License. You may obtain
	# a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
	# License for the specific language governing permissions and limitations
	# under the License.
	import pytest

	from devops.helpers import helpers
	from tcp_tests import logger

	LOG = logger.logger


	class TestFailoverCeph(object):
	"""Test class for testing MCP Ceph failover"""

	TEMPEST_JOB_NAME = 'cvp-tempest'
	TEMPEST_JOB_PARAMETERS = {
	'TEMPEST_ENDPOINT_TYPE': 'internalURL',
	'TEMPEST_TEST_PATTERN': 'set=smoke'
	}

	SANITY_JOB_NAME = 'cvp-sanity'
	SANITY_JOB_PARAMETERS = {
	'EXTRA_PARAMS': {
	'envs': [
	"tests_set=-k "
	"'not test_ceph_health and not test_prometheus_alert_count'"
	]
	}
	}

	JENKINS_START_TIMEOUT = 60
	JENKINS_BUILD_TIMEOUT = 60 * 15

	def get_ceph_health(self, ssh, node_names):
	"""Get Ceph health status on specified nodes

	:param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
	:param node_names: list, full hostnames of Ceph OSD nodes
	:return: dict, Ceph health status from each OSD node (output of
	'ceph -s' command executed on each node)
	"""
	return {
	node_name: ssh.check_call(
	"ceph -s",
	node_name=node_name,
	raise_on_err=False)['stdout_str']
	for node_name in node_names
	}

	@pytest.mark.grab_versions
	@pytest.mark.restart_osd_node
	def test_restart_osd_node(
	self,
	salt_actions,
	drivetrain_actions,
	underlay_actions,
	show_step):
	"""Verify that Ceph OSD node is not affected by system restart

	Scenario:
	1. Find Ceph OSD nodes
	2. Check Ceph cluster health before node restart (skipped until
	PROD-31374 is fixed)
	3. Restart 1 Ceph OSD node
	4. Check Ceph cluster health after node restart (skipped until
	PROD-31374 is fixed)
	5. Run Tempest smoke test suite
	6. Run test_ceph_status.py::test_ceph_osd and
	test_services.py::test_check_services[osd] sanity tests

	Duration: ~9 min
	"""
	salt = salt_actions
	ssh = underlay_actions
	dt = drivetrain_actions

	# Find Ceph OSD nodes
	show_step(1)
	tgt = "I@ceph:osd"
	osd_hosts = salt.local(tgt, "test.ping")['return'][0].keys()
	# Select a node for the test
	osd_host = osd_hosts[0]

	# Check Ceph cluster health before node restart
	show_step(2)
	ceph_health = self.get_ceph_health(ssh, osd_hosts)
	# FIXME: uncomment the check once PROD-31374 is fixed
	# status = all(
	# ["OK" in status for node, status in ceph_health.items()])
	# assert status, "Ceph health is not OK: {0}".format(ceph_health)

	# Restart a Ceph OSD node
	show_step(3)
	LOG.info("Sending reboot command to '{}' node.".format(osd_host))
	remote = ssh.remote(node_name=osd_host)
	remote.execute_async("/sbin/shutdown -r now")

	# Wait for restarted node to boot and become accessible
	helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
	echo_request = "echo"
	echo_response = salt.local(
	osd_host, "test.echo", echo_request)['return'][0]
	assert echo_request == echo_response[osd_host], (
	"Minion on node '{}' node is not responding after node "
	"reboot.".format(osd_host)
	)
	LOG.info("'{}' node is back after reboot.".format(osd_host))

	# Check Ceph cluster health after node restart
	show_step(4)
	ceph_health = self.get_ceph_health(ssh, osd_hosts) # noqa
	# FIXME: uncomment the check once PROD-31374 is fixed
	# status = all(
	# ["OK" in status for node, status in ceph_health.items()])
	# assert status, "Ceph health is not OK: {0}".format(ceph_health)

	# Run Tempest smoke test suite
	show_step(5)
	status = dt.start_job_on_cid_jenkins(
	job_name=self.TEMPEST_JOB_NAME,
	job_parameters=self.TEMPEST_JOB_PARAMETERS,
	start_timeout=self.JENKINS_START_TIMEOUT,
	build_timeout=self.JENKINS_BUILD_TIMEOUT
	)

	assert status == 'SUCCESS', (
	"'{0}' job run status is {1} after executing Tempest smoke "
	"tests".format(
	self.TEMPEST_JOB_NAME, status)
	)

	# Run Sanity test
	show_step(6)
	status = dt.start_job_on_cid_jenkins(
	job_name=self.SANITY_JOB_NAME,
	job_parameters=self.SANITY_JOB_PARAMETERS,
	start_timeout=self.JENKINS_START_TIMEOUT,
	build_timeout=self.JENKINS_BUILD_TIMEOUT
	)
	assert status == 'SUCCESS', (
	"'{0}' job run status is {1} after executing selected sanity "
	"tests".format(
	self.SANITY_JOB_NAME, status)
	)

	@pytest.mark.grab_versions
	@pytest.mark.restart_cmn_node
	def test_restart_cmn_node(
	self,
	salt_actions,
	drivetrain_actions,
	underlay_actions,
	show_step):
	"""Verify that Ceph CMN node is not affected by system restart

	Scenario:
	1. Find Ceph CMN nodes
	2. Check Ceph cluster health before node restart (skipped until
	PROD-31374 is fixed)
	3. Restart 1 Ceph CMN node
	4. Check Ceph cluster health after node restart (skipped until
	PROD-31374 is fixed)
	5. Run Tempest smoke test suite
	6. Run test_ceph_status.py::test_ceph_replicas and
	test_services.py::test_check_services[cmn] sanity tests

	Duration: ~9 min
	"""
	salt = salt_actions
	ssh = underlay_actions
	dt = drivetrain_actions

	# Find Ceph CMN nodes
	show_step(1)
	tgt = "I@ceph:mon"
	cmn_hosts = salt.local(tgt, "test.ping")['return'][0].keys()
	# Select a node for the test
	cmn_host = cmn_hosts[0]

	# Check Ceph cluster health before node restart
	show_step(2)
	ceph_health = self.get_ceph_health(ssh, cmn_hosts)
	# FIXME: uncomment the check once PROD-31374 is fixed
	# status = all(
	# ["OK" in status for node, status in ceph_health.items()])
	# assert status, "Ceph health is not OK: {0}".format(ceph_health)

	# Restart a Ceph CMN node
	show_step(3)
	LOG.info("Sending reboot command to '{}' node.".format(cmn_host))
	remote = ssh.remote(node_name=cmn_host)
	remote.execute_async("/sbin/shutdown -r now")

	# Wait for restarted node to boot and become accessible
	helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
	echo_request = "echo"
	echo_response = salt.local(
	cmn_host, "test.echo", echo_request)['return'][0]
	assert echo_request == echo_response[cmn_host], (
	"Minion on node '{}' node is not responding after node "
	"reboot.".format(cmn_host)
	)
	LOG.info("'{}' node is back after reboot.".format(cmn_host))

	# Check Ceph cluster health after node restart
	show_step(4)
	ceph_health = self.get_ceph_health(ssh, cmn_hosts) # noqa
	# FIXME: uncomment the check once PROD-31374 is fixed
	# status = all(
	# ["OK" in status for node, status in ceph_health.items()])
	# assert status, "Ceph health is not OK: {0}".format(ceph_health)

	# Run Tempest smoke test suite
	show_step(5)
	status = dt.start_job_on_cid_jenkins(
	job_name=self.TEMPEST_JOB_NAME,
	job_parameters=self.TEMPEST_JOB_PARAMETERS,
	start_timeout=self.JENKINS_START_TIMEOUT,
	build_timeout=self.JENKINS_BUILD_TIMEOUT
	)

	assert status == 'SUCCESS', (
	"'{0}' job run status is {1} after executing Tempest smoke "
	"tests".format(
	self.TEMPEST_JOB_NAME, status)
	)

	# Run Sanity test
	show_step(6)
	status = dt.start_job_on_cid_jenkins(
	job_name=self.SANITY_JOB_NAME,
	job_parameters=self.SANITY_JOB_PARAMETERS,
	start_timeout=self.JENKINS_START_TIMEOUT,
	build_timeout=self.JENKINS_BUILD_TIMEOUT
	)
	assert status == 'SUCCESS', (
	"'{0}' job run status is {1} after executing selected sanity "
	"tests".format(
	self.SANITY_JOB_NAME, status)
	)

	@pytest.mark.grab_versions
	@pytest.mark.restart_rgw_node
	def test_restart_rgw_node(
	self,
	salt_actions,
	drivetrain_actions,
	underlay_actions,
	show_step):
	"""Verify that Ceph RGW node is not affected by system restart

	Scenario:
	1. Find Ceph RGW nodes
	2. Check Ceph cluster health before node restart (skipped until
	PROD-31374 is fixed)
	3. Restart 1 Ceph RGW node
	4. Check Ceph cluster health after node restart (skipped until
	PROD-31374 is fixed)
	5. Run Tempest smoke test suite
	6. Run test_services.py::test_check_services[rgw] sanity test

	Duration: ~9 min
	"""
	salt = salt_actions
	ssh = underlay_actions
	dt = drivetrain_actions

	# Find Ceph RGW nodes
	show_step(1)
	tgt = "I@ceph:radosgw"
	rgw_hosts = salt.local(tgt, "test.ping")['return'][0].keys()
	# Select a node for the test
	rgw_host = rgw_hosts[0]

	# Check Ceph cluster health before node restart
	show_step(2)
	ceph_health = self.get_ceph_health(ssh, rgw_hosts)
	# FIXME: uncomment the check once PROD-31374 is fixed
	# status = all(
	# ["OK" in status for node, status in ceph_health.items()])
	# assert status, "Ceph health is not OK: {0}".format(ceph_health)

	# Restart a Ceph RGW node
	show_step(3)
	LOG.info("Sending reboot command to '{}' node.".format(rgw_host))
	remote = ssh.remote(node_name=rgw_host)
	remote.execute_async("/sbin/shutdown -r now")

	# Wait for restarted node to boot and become accessible
	helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
	echo_request = "echo"
	echo_response = salt.local(
	rgw_host, "test.echo", echo_request)['return'][0]
	assert echo_request == echo_response[rgw_host], (
	"Minion on node '{}' node is not responding after node "
	"reboot.".format(rgw_host)
	)
	LOG.info("'{}' node is back after reboot.".format(rgw_host))

	# Check Ceph cluster health after node restart
	show_step(4)
	ceph_health = self.get_ceph_health(ssh, rgw_hosts) # noqa
	# FIXME: uncomment the check once PROD-31374 is fixed
	# status = all(
	# ["OK" in status for node, status in ceph_health.items()])
	# assert status, "Ceph health is not OK: {0}".format(ceph_health)

	# Run Tempest smoke test suite
	show_step(5)
	status = dt.start_job_on_cid_jenkins(
	job_name=self.TEMPEST_JOB_NAME,
	job_parameters=self.TEMPEST_JOB_PARAMETERS,
	start_timeout=self.JENKINS_START_TIMEOUT,
	build_timeout=self.JENKINS_BUILD_TIMEOUT
	)

	assert status == 'SUCCESS', (
	"'{0}' job run status is {1} after executing Tempest smoke "
	"tests".format(
	self.TEMPEST_JOB_NAME, status)
	)

	# Run Sanity test
	show_step(6)
	status = dt.start_job_on_cid_jenkins(
	job_name=self.SANITY_JOB_NAME,
	job_parameters=self.SANITY_JOB_PARAMETERS,
	start_timeout=self.JENKINS_START_TIMEOUT,
	build_timeout=self.JENKINS_BUILD_TIMEOUT
	)
	assert status == 'SUCCESS', (
	"'{0}' job run status is {1} after executing selected sanity "
	"tests".format(
	self.SANITY_JOB_NAME, status)
	)

	# #######################################################################
	# ############# Tests for fuel-devops deployed environments #############
	# #######################################################################
	def show_failed_msg(self, failed):
	return "There are failed tempest tests:\n\n {0}".format(
	'\n\n '.join([(name + ': ' + detail)
	for name, detail in failed.items()]))

	@pytest.mark.grab_versions
	@pytest.mark.fail_snapshot
	def _test_restart_osd_node(self, func_name, underlay, config,
	openstack_deployed, ceph_deployed,
	openstack_actions, hardware,
	rally, show_step):
	"""Test restart ceph osd node

	Scenario:
	1. Find ceph osd nodes
	2. Check ceph health before restart
	3. Restart 1 ceph osd node
	4. Check ceph health after restart
	5. Run tempest smoke after failover
	6. Check tempest report for failed tests

	Requiremets:
	- Salt cluster
	- OpenStack cluster
	- Ceph cluster
	"""
	openstack_actions._salt.local(
	tgt='*', fun='cmd.run',
	args='service ntp stop; ntpd -gq; service ntp start')
	# STEP #1
	show_step(1)
	osd_node_names = underlay.get_target_node_names(
	target='osd')

	# STEP #2
	show_step(2)
	# Get the ceph health output before restart
	health_before = self.get_ceph_health(underlay, osd_node_names)
	assert all(["OK" in p for n, p in health_before.items()]), (
	"'Ceph health is not ok from node: {0}".format(health_before))

	# STEP #3
	show_step(3)
	hardware.warm_restart_nodes(underlay, 'osd01')

	openstack_actions._salt.local(
	tgt='*', fun='cmd.run',
	args='service ntp stop; ntpd -gq; service ntp start')

	# STEP #4
	show_step(4)
	# Get the ceph health output after restart
	health_after = self.get_ceph_health(underlay, osd_node_names)
	assert all(["OK" in p for n, p in health_before.items()]), (
	"'Ceph health is not ok from node: {0}".format(health_after))

	rally.run_container()

	# STEP #5
	show_step(5)
	results = rally.run_tempest(pattern='set=smoke',
	conf_name='/var/lib/ceph_mcp.conf',
	report_prefix=func_name,
	designate_plugin=False,
	timeout=1800)
	# Step #6
	show_step(6)
	assert not results['fail'], self.show_failed_msg(results['fail'])

	LOG.info("************* DONE ************")

	@pytest.mark.grab_versions
	@pytest.mark.fail_snapshot
	def _test_restart_cmn_node(self, func_name, underlay, config,
	openstack_deployed, ceph_deployed,
	core_actions,
	salt_actions, openstack_actions,
	rally, show_step, hardware):
	"""Test restart ceph cmn node

	Scenario:
	1. Find ceph cmn nodes
	2. Check ceph health before restart
	3. Restart 1 ceph cmn node
	4. Check ceph health after restart
	5. Run tempest smoke after failover
	6. Check tempest report for failed tests

	Requiremets:
	- Salt cluster
	- OpenStack cluster
	- Ceph cluster
	"""
	openstack_actions._salt.local(
	tgt='*', fun='cmd.run',
	args='service ntp stop; ntpd -gq; service ntp start')
	# STEP #1
	show_step(1)
	cmn_node_names = underlay.get_target_node_names(
	target='cmn')

	# STEP #2
	show_step(2)
	# Get the ceph health output before restart
	health_before = self.get_ceph_health(underlay, cmn_node_names)
	assert all(["OK" in p for n, p in health_before.items()]), (
	"'Ceph health is not ok from node: {0}".format(health_before))

	# STEP #3
	show_step(3)
	hardware.warm_restart_nodes(underlay, 'cmn01')

	openstack_actions._salt.local(
	tgt='*', fun='cmd.run',
	args='service ntp stop; ntpd -gq; service ntp start')

	# STEP #4
	show_step(4)
	# Get the ceph health output after restart
	health_after = self.get_ceph_health(underlay, cmn_node_names)
	assert all(["OK" in p for n, p in health_before.items()]), (
	"'Ceph health is not ok from node: {0}".format(health_after))

	rally.run_container()

	# STEP #5
	show_step(5)
	results = rally.run_tempest(pattern='set=smoke',
	conf_name='/var/lib/ceph_mcp.conf',
	report_prefix=func_name,
	designate_plugin=False,
	timeout=1800)
	# Step #6
	show_step(6)
	assert not results['fail'], self.show_failed_msg(results['fail'])

	LOG.info("************* DONE ************")

	@pytest.mark.grab_versions
	@pytest.mark.fail_snapshot
	def _test_restart_rgw_node(self, func_name, underlay, config,
	openstack_deployed, ceph_deployed,
	core_actions, hardware,
	salt_actions, openstack_actions,
	rally, show_step):
	"""Test restart ceph rgw node

	Scenario:
	1. Find ceph rgw nodes
	2. Check ceph health before restart
	3. Restart 1 ceph rgw node
	4. Check ceph health after restart
	5. Run tempest smoke after failover
	6. Check tempest report for failed tests

	Requiremets:
	- Salt cluster
	- OpenStack cluster
	- Ceph cluster
	"""
	openstack_actions._salt.local(
	tgt='*', fun='cmd.run',
	args='service ntp stop; ntpd -gq; service ntp start')

	# STEP #1
	show_step(1)
	rgw_node_names = underlay.get_target_node_names(
	target='rgw')
	if not rgw_node_names:
	pytest.skip('Skip as there are not rgw nodes in deploy')

	# STEP #2
	show_step(2)
	# Get the ceph health output before restart
	health_before = self.get_ceph_health(underlay, rgw_node_names)
	assert all(["OK" in p for n, p in health_before.items()]), (
	"'Ceph health is not ok from node: {0}".format(health_before))

	# STEP #3
	show_step(3)
	hardware.warm_restart_nodes(underlay, 'rgw01')

	openstack_actions._salt.local(
	tgt='*', fun='cmd.run',
	args='service ntp stop; ntpd -gq; service ntp start')

	# STEP #4
	show_step(4)
	# Get the ceph health output after restart
	health_after = self.get_ceph_health(underlay, rgw_node_names)
	assert all(["OK" in p for n, p in health_before.items()]), (
	"'Ceph health is not ok from node: {0}".format(health_after))

	rally.run_container()

	# STEP #5
	show_step(5)
	results = rally.run_tempest(pattern='set=smoke',
	conf_name='/var/lib/ceph_mcp.conf',
	designate_plugin=False,
	report_prefix=func_name,
	timeout=1800)
	# Step #6
	show_step(6)
	assert not results['fail'], self.show_failed_msg(results['fail'])

	LOG.info("************* DONE ************")