Implement automated Ceph failover tests
Implement automated tests that verify the reboot
of Ceph OSD, CMN, and RGW nodes.
Related-PROD: PROD-32684
Change-Id: I020eafced9dbaccf168c6e4466abc545c68bcc02
diff --git a/tcp_tests/tests/system/test_failover_ceph.py b/tcp_tests/tests/system/test_failover_ceph.py
index 934e28b..2493083 100644
--- a/tcp_tests/tests/system/test_failover_ceph.py
+++ b/tcp_tests/tests/system/test_failover_ceph.py
@@ -13,28 +13,383 @@
# under the License.
import pytest
+from devops.helpers import helpers
+
from tcp_tests import logger
+from tcp_tests.utils import get_jenkins_job_stages
+from tcp_tests.utils import run_jenkins_job
LOG = logger.logger
class TestFailoverCeph(object):
- """Test class for testing MCP ceph failover"""
+ """Test class for testing MCP Ceph failover"""
- def get_ceph_health(self, underlay, node_names):
- """Get ceph health on the specified nodes
+ TEMPEST_JOB_NAME = 'cvp-tempest'
+ TEMPEST_JOB_PARAMETERS = {
+ 'TEMPEST_ENDPOINT_TYPE': 'internalURL',
+ 'TEMPEST_TEST_PATTERN': 'set=smoke'
+ }
- Returns the dict {<node_name>: <str>, }
- where <str> is the 'ceph -s' output
+ SANITY_JOB_NAME = 'cvp-sanity'
+ SANITY_JOB_PARAMETERS = {
+ 'EXTRA_PARAMS': {
+ 'envs': [
+ "tests_set=-k "
+ "'not test_ceph_health and not test_prometheus_alert_count'"
+ ]
+ }
+ }
+
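+ # Timeouts (in seconds) for a Jenkins build to start and to finish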
+ JENKINS_START_TIMEOUT = 60
+ JENKINS_BUILD_TIMEOUT = 60 * 15
+
+ def get_ceph_health(self, ssh, node_names):
+ """Get Ceph health status on specified nodes
+
+ :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
+ :param node_names: list, full hostnames of Ceph OSD nodes
+ :return: dict, Ceph health status from each OSD node (output of
+ 'ceph -s' command executed on each node)
"""
- res = {
- node_name: underlay.check_call("ceph -s",
- node_name=node_name,
- raise_on_err=False)['stdout_str']
+ return {
+ node_name: ssh.check_call(
+ "ceph -s",
+ node_name=node_name,
+ raise_on_err=False)['stdout_str']
for node_name in node_names
}
- return res
+ def run_jenkins_job(
+ self, creds, name, parameters, start_timeout, build_timeout):
+ """Execute a Jenkins job with provided parameters
+
+ :param creds: dict, Jenkins URL and user credentials
+ :param name: string, name of the Jenkins job to execute
+ :param parameters: dict, parameters for the Jenkins job
+ :param start_timeout: int, timeout to wait until the build is started
+ :param build_timeout: int, timeout to wait until the build is finished
+ :return: tuple, Jenkins job build execution status, high-level
+ description of the build, and verbose description of executed job
+ stages
+ """
+ jenkins_url, jenkins_user, jenkins_pass = (
+ creds['url'], creds['user'], creds['pass'])
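+ # Trigger the job and wait for the build to finish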
+ build_status = run_jenkins_job.run_job(
+ host=jenkins_url,
+ username=jenkins_user,
+ password=jenkins_pass,
+ start_timeout=start_timeout,
+ build_timeout=build_timeout,
+ verbose=False,
+ job_name=name,
+ job_parameters=parameters)
+
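+ # Collect the description and stage details of the last build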
+ description, stages = get_jenkins_job_stages.get_deployment_result(
+ host=jenkins_url,
+ username=jenkins_user,
+ password=jenkins_pass,
+ job_name=name,
+ build_number='lastBuild')
+
+ return build_status, description, stages
+
+ @pytest.mark.grab_versions
+ @pytest.mark.restart_osd_node
+ def test_restart_osd_node(
+ self,
+ salt_actions,
+ underlay_actions,
+ show_step):
+ """Verify that Ceph OSD node is not affected by system restart
+
+ Scenario:
+ 1. Find Ceph OSD nodes
+ 2. Check Ceph cluster health before node restart (skipped until
+ PROD-31374 is fixed)
+ 3. Restart 1 Ceph OSD node
+ 4. Check Ceph cluster health after node restart (skipped until
+ PROD-31374 is fixed)
+ 5. Run Tempest smoke test suite
+ 6. Run test_ceph_status.py::test_ceph_osd and
+ test_services.py::test_check_services[osd] sanity tests
+
+ Duration: ~9 min
+ """
+ salt = salt_actions
+ ssh = underlay_actions
+
+ # Find Ceph OSD nodes
+ show_step(1)
+ tgt = "I@ceph:osd"
+ osd_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
+ # Select a node for the test
+ osd_host = osd_hosts[0]
+
+ # Check Ceph cluster health before node restart
+ show_step(2)
+ ceph_health = self.get_ceph_health(ssh, osd_hosts)
+ # FIXME: uncomment the check once PROD-31374 is fixed
+ # status = all(
+ # ["OK" in status for node, status in ceph_health.items()])
+ # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+ # Restart a Ceph OSD node
+ show_step(3)
+ LOG.info("Sending reboot command to '{}' node.".format(osd_host))
+ remote = ssh.remote(node_name=osd_host)
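+ # Reboot asynchronously so the call does not block on the SSH
+ # connection dropped by the shutdown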
+ remote.execute_async("/sbin/shutdown -r now")
+
+ # Wait for restarted node to boot and become accessible
+ helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
+ echo_request = "echo"
+ echo_response = salt.local(
+ osd_host, "test.echo", echo_request)['return'][0]
+ assert echo_request == echo_response[osd_host], (
+ "Minion on node '{}' node is not responding after node "
+ "reboot.".format(osd_host)
+ )
+ LOG.info("'{}' node is back after reboot.".format(osd_host))
+
+ # Check Ceph cluster health after node restart
+ show_step(4)
+ ceph_health = self.get_ceph_health(ssh, osd_hosts) # noqa
+ # FIXME: uncomment the check once PROD-31374 is fixed
+ # status = all(
+ # ["OK" in status for node, status in ceph_health.items()])
+ # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+ # Run Tempest smoke test suite
+ show_step(5)
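+ # Fetch Jenkins credentials of the cluster under test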
+ jenkins_creds = salt.get_cluster_jenkins_creds()
+ status, description, stages = self.run_jenkins_job(
+ jenkins_creds,
+ self.TEMPEST_JOB_NAME,
+ self.TEMPEST_JOB_PARAMETERS,
+ self.JENKINS_START_TIMEOUT,
+ self.JENKINS_BUILD_TIMEOUT
+ )
+ assert status == 'SUCCESS', (
+ "'{0}' job run status is {1} after executing Tempest smoke "
+ "tests. Please check the build:\n{2}\n\nExecuted build "
+ "stages:\n{3}".format(
+ self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
+ )
+
+ # Run Sanity test
+ show_step(6)
+ status, description, stages = self.run_jenkins_job(
+ jenkins_creds,
+ self.SANITY_JOB_NAME,
+ self.SANITY_JOB_PARAMETERS,
+ self.JENKINS_START_TIMEOUT,
+ self.JENKINS_BUILD_TIMEOUT
+ )
+ assert status == 'SUCCESS', (
+ "'{0}' job run status is {1} after executing selected sanity "
+ "tests. Please check the build:\n{2}\n\nExecuted build "
+ "stages:\n{3}".format(
+ self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
+ )
+
+ @pytest.mark.grab_versions
+ @pytest.mark.restart_cmn_node
+ def test_restart_cmn_node(
+ self,
+ salt_actions,
+ underlay_actions,
+ show_step):
+ """Verify that Ceph CMN node is not affected by system restart
+
+ Scenario:
+ 1. Find Ceph CMN nodes
+ 2. Check Ceph cluster health before node restart (skipped until
+ PROD-31374 is fixed)
+ 3. Restart 1 Ceph CMN node
+ 4. Check Ceph cluster health after node restart (skipped until
+ PROD-31374 is fixed)
+ 5. Run Tempest smoke test suite
+ 6. Run test_ceph_status.py::test_ceph_replicas and
+ test_services.py::test_check_services[cmn] sanity tests
+
+ Duration: ~9 min
+ """
+ salt = salt_actions
+ ssh = underlay_actions
+
+ # Find Ceph CMN nodes
+ show_step(1)
+ tgt = "I@ceph:mon"
+ cmn_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
+ # Select a node for the test
+ cmn_host = cmn_hosts[0]
+
+ # Check Ceph cluster health before node restart
+ show_step(2)
+ ceph_health = self.get_ceph_health(ssh, cmn_hosts)
+ # FIXME: uncomment the check once PROD-31374 is fixed
+ # status = all(
+ # ["OK" in status for node, status in ceph_health.items()])
+ # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+ # Restart a Ceph CMN node
+ show_step(3)
+ LOG.info("Sending reboot command to '{}' node.".format(cmn_host))
+ remote = ssh.remote(node_name=cmn_host)
+ remote.execute_async("/sbin/shutdown -r now")
+
+ # Wait for restarted node to boot and become accessible
+ helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
+ echo_request = "echo"
+ echo_response = salt.local(
+ cmn_host, "test.echo", echo_request)['return'][0]
+ assert echo_request == echo_response[cmn_host], (
+ "Minion on node '{}' node is not responding after node "
+ "reboot.".format(cmn_host)
+ )
+ LOG.info("'{}' node is back after reboot.".format(cmn_host))
+
+ # Check Ceph cluster health after node restart
+ show_step(4)
+ ceph_health = self.get_ceph_health(ssh, cmn_hosts) # noqa
+ # FIXME: uncomment the check once PROD-31374 is fixed
+ # status = all(
+ # ["OK" in status for node, status in ceph_health.items()])
+ # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+ # Run Tempest smoke test suite
+ show_step(5)
+ jenkins_creds = salt.get_cluster_jenkins_creds()
+ status, description, stages = self.run_jenkins_job(
+ jenkins_creds,
+ self.TEMPEST_JOB_NAME,
+ self.TEMPEST_JOB_PARAMETERS,
+ self.JENKINS_START_TIMEOUT,
+ self.JENKINS_BUILD_TIMEOUT
+ )
+ assert status == 'SUCCESS', (
+ "'{0}' job run status is {1} after executing Tempest smoke "
+ "tests. Please check the build:\n{2}\n\nExecuted build "
+ "stages:\n{3}".format(
+ self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
+ )
+
+ # Run Sanity test
+ show_step(6)
+ status, description, stages = self.run_jenkins_job(
+ jenkins_creds,
+ self.SANITY_JOB_NAME,
+ self.SANITY_JOB_PARAMETERS,
+ self.JENKINS_START_TIMEOUT,
+ self.JENKINS_BUILD_TIMEOUT
+ )
+ assert status == 'SUCCESS', (
+ "'{0}' job run status is {1} after executing selected sanity "
+ "tests. Please check the build:\n{2}\n\nExecuted build "
+ "stages:\n{3}".format(
+ self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
+ )
+
+ @pytest.mark.grab_versions
+ @pytest.mark.restart_rgw_node
+ def test_restart_rgw_node(
+ self,
+ salt_actions,
+ underlay_actions,
+ show_step):
+ """Verify that Ceph RGW node is not affected by system restart
+
+ Scenario:
+ 1. Find Ceph RGW nodes
+ 2. Check Ceph cluster health before node restart (skipped until
+ PROD-31374 is fixed)
+ 3. Restart 1 Ceph RGW node
+ 4. Check Ceph cluster health after node restart (skipped until
+ PROD-31374 is fixed)
+ 5. Run Tempest smoke test suite
+ 6. Run test_services.py::test_check_services[rgw] sanity test
+
+ Duration: ~9 min
+ """
+ salt = salt_actions
+ ssh = underlay_actions
+
+ # Find Ceph RGW nodes
+ show_step(1)
+ tgt = "I@ceph:radosgw"
+ rgw_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
+ # Select a node for the test
+ rgw_host = rgw_hosts[0]
+
+ # Check Ceph cluster health before node restart
+ show_step(2)
+ ceph_health = self.get_ceph_health(ssh, rgw_hosts)
+ # FIXME: uncomment the check once PROD-31374 is fixed
+ # status = all(
+ # ["OK" in status for node, status in ceph_health.items()])
+ # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+ # Restart a Ceph RGW node
+ show_step(3)
+ LOG.info("Sending reboot command to '{}' node.".format(rgw_host))
+ remote = ssh.remote(node_name=rgw_host)
+ remote.execute_async("/sbin/shutdown -r now")
+
+ # Wait for restarted node to boot and become accessible
+ helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
+ echo_request = "echo"
+ echo_response = salt.local(
+ rgw_host, "test.echo", echo_request)['return'][0]
+ assert echo_request == echo_response[rgw_host], (
+ "Minion on node '{}' node is not responding after node "
+ "reboot.".format(rgw_host)
+ )
+ LOG.info("'{}' node is back after reboot.".format(rgw_host))
+
+ # Check Ceph cluster health after node restart
+ show_step(4)
+ ceph_health = self.get_ceph_health(ssh, rgw_hosts) # noqa
+ # FIXME: uncomment the check once PROD-31374 is fixed
+ # status = all(
+ # ["OK" in status for node, status in ceph_health.items()])
+ # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+ # Run Tempest smoke test suite
+ show_step(5)
+ jenkins_creds = salt.get_cluster_jenkins_creds()
+ status, description, stages = self.run_jenkins_job(
+ jenkins_creds,
+ self.TEMPEST_JOB_NAME,
+ self.TEMPEST_JOB_PARAMETERS,
+ self.JENKINS_START_TIMEOUT,
+ self.JENKINS_BUILD_TIMEOUT
+ )
+ assert status == 'SUCCESS', (
+ "'{0}' job run status is {1} after executing Tempest smoke "
+ "tests. Please check the build:\n{2}\n\nExecuted build "
+ "stages:\n{3}".format(
+ self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
+ )
+
+ # Run Sanity test
+ show_step(6)
+ status, description, stages = self.run_jenkins_job(
+ jenkins_creds,
+ self.SANITY_JOB_NAME,
+ self.SANITY_JOB_PARAMETERS,
+ self.JENKINS_START_TIMEOUT,
+ self.JENKINS_BUILD_TIMEOUT
+ )
+ assert status == 'SUCCESS', (
+ "'{0}' job run status is {1} after executing selected sanity "
+ "tests. Please check the build:\n{2}\n\nExecuted build "
+ "stages:\n{3}".format(
+ self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
+ )
+
+ # #######################################################################
+ # ############# Tests for fuel-devops deployed environments #############
+ # #######################################################################
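+ # NOTE: the tests below are prefixed with '_' so that pytest does not
+ # collect them; they are kept for fuel-devops deployed environments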
def show_failed_msg(self, failed):
return "There are failed tempest tests:\n\n {0}".format(
'\n\n '.join([(name + ': ' + detail)
@@ -42,10 +397,10 @@
@pytest.mark.grab_versions
@pytest.mark.fail_snapshot
- def test_restart_osd_node(self, func_name, underlay, config,
- openstack_deployed, ceph_deployed,
- openstack_actions, hardware,
- rally, show_step):
+ def _test_restart_osd_node(self, func_name, underlay, config,
+ openstack_deployed, ceph_deployed,
+ openstack_actions, hardware,
+ rally, show_step):
"""Test restart ceph osd node
Scenario:
@@ -108,11 +463,11 @@
@pytest.mark.grab_versions
@pytest.mark.fail_snapshot
- def test_restart_cmn_node(self, func_name, underlay, config,
- openstack_deployed, ceph_deployed,
- core_actions,
- salt_actions, openstack_actions,
- rally, show_step, hardware):
+ def _test_restart_cmn_node(self, func_name, underlay, config,
+ openstack_deployed, ceph_deployed,
+ core_actions,
+ salt_actions, openstack_actions,
+ rally, show_step, hardware):
"""Test restart ceph cmn node
Scenario:
@@ -175,11 +530,11 @@
@pytest.mark.grab_versions
@pytest.mark.fail_snapshot
- def test_restart_rgw_node(self, func_name, underlay, config,
- openstack_deployed, ceph_deployed,
- core_actions, hardware,
- salt_actions, openstack_actions,
- rally, show_step):
+ def _test_restart_rgw_node(self, func_name, underlay, config,
+ openstack_deployed, ceph_deployed,
+ core_actions, hardware,
+ salt_actions, openstack_actions,
+ rally, show_step):
"""Test restart ceph rgw node
Scenario: