Implement automated Ceph failover tests

Implement automated tests that verify failover
on reboot of Ceph OSD, CMN and RGW nodes.

Related-PROD: PROD-32684
Change-Id: I020eafced9dbaccf168c6e4466abc545c68bcc02
diff --git a/tcp_tests/tests/system/test_failover_ceph.py b/tcp_tests/tests/system/test_failover_ceph.py
index 934e28b..2493083 100644
--- a/tcp_tests/tests/system/test_failover_ceph.py
+++ b/tcp_tests/tests/system/test_failover_ceph.py
@@ -13,28 +13,383 @@
 #    under the License.
 import pytest
 
+from devops.helpers import helpers
+
 from tcp_tests import logger
+from tcp_tests.utils import get_jenkins_job_stages
+from tcp_tests.utils import run_jenkins_job
 
 LOG = logger.logger
 
 
 class TestFailoverCeph(object):
-    """Test class for testing MCP ceph failover"""
+    """Test class for testing MCP Ceph failover"""
 
-    def get_ceph_health(self, underlay, node_names):
-        """Get ceph health on the specified nodes
+    TEMPEST_JOB_NAME = 'cvp-tempest'
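+    # Tempest job parameters: run the smoke test set via internal endpoints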
+    TEMPEST_JOB_PARAMETERS = {
+        'TEMPEST_ENDPOINT_TYPE': 'internalURL',
+        'TEMPEST_TEST_PATTERN': 'set=smoke'
+    }
 
-        Returns the dict {<node_name>: <str>, }
-        where <str> is the 'ceph -s' output
+    SANITY_JOB_NAME = 'cvp-sanity'
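+    # Sanity job parameters: skip the Ceph health and Prometheus alert checks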
+    SANITY_JOB_PARAMETERS = {
+        'EXTRA_PARAMS': {
+            'envs': [
+                "tests_set=-k "
+                "'not test_ceph_health and not test_prometheus_alert_count'"
+            ]
+        }
+    }
+
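+    # Timeouts (in seconds) for a Jenkins build to start and to finish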
+    JENKINS_START_TIMEOUT = 60
+    JENKINS_BUILD_TIMEOUT = 60 * 15
+
+    def get_ceph_health(self, ssh, node_names):
+        """Get Ceph health status on specified nodes
+
+        :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
+        :param node_names: list, full hostnames of Ceph nodes
+        :return: dict, Ceph health status from each node (output of the
+            'ceph -s' command executed on each node)
         """
-        res = {
-            node_name: underlay.check_call("ceph -s",
-                                           node_name=node_name,
-                                           raise_on_err=False)['stdout_str']
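+        # raise_on_err=False: do not fail if 'ceph -s' errors on a node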
+        return {
+            node_name: ssh.check_call(
+                "ceph -s",
+                node_name=node_name,
+                raise_on_err=False)['stdout_str']
             for node_name in node_names
         }
-        return res
 
+    def run_jenkins_job(
+            self, creds, name, parameters, start_timeout, build_timeout):
+        """Execute a Jenkins job with provided parameters
+
+        :param creds: dict, Jenkins URL and user credentials
+        :param name: string, name of the Jenkins job to execute
+        :param parameters: dict, parameters for the Jenkins job
+        :param start_timeout: int, timeout to wait until the build is started
+        :param build_timeout: int, timeout to wait until the build is finished
+        :return: tuple, Jenkins job build execution status, high level
+            description of the build and verbose description of executed job
+            stages
+        """
+        jenkins_url, jenkins_user, jenkins_pass = (
+            creds['url'], creds['user'], creds['pass'])
+        build_status = run_jenkins_job.run_job(
+            host=jenkins_url,
+            username=jenkins_user,
+            password=jenkins_pass,
+            start_timeout=start_timeout,
+            build_timeout=build_timeout,
+            verbose=False,
+            job_name=name,
+            job_parameters=parameters)
+
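+        # Collect the build description and stage details for reporting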
+        description, stages = get_jenkins_job_stages.get_deployment_result(
+            host=jenkins_url,
+            username=jenkins_user,
+            password=jenkins_pass,
+            job_name=name,
+            build_number='lastBuild')
+
+        return build_status, description, stages
+
+    @pytest.mark.grab_versions
+    @pytest.mark.restart_osd_node
+    def test_restart_osd_node(
+            self,
+            salt_actions,
+            underlay_actions,
+            show_step):
+        """Verify that Ceph OSD node is not affected by system restart
+
+        Scenario:
+        1. Find Ceph OSD nodes
+        2. Check Ceph cluster health before node restart (skipped until
+            PROD-31374 is fixed)
+        3. Restart 1 Ceph OSD node
+        4. Check Ceph cluster health after node restart (skipped until
+            PROD-31374 is fixed)
+        5. Run Tempest smoke test suite
+        6. Run CVP sanity test suite, excluding test_ceph_health and
+            test_prometheus_alert_count checks
+
+        Duration: ~9 min
+        """
+        salt = salt_actions
+        ssh = underlay_actions
+
+        # Find Ceph OSD nodes
+        show_step(1)
+        tgt = "I@ceph:osd"
+        osd_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
+        # Select a node for the test
+        osd_host = osd_hosts[0]
+
+        # Check Ceph cluster health before node restart
+        show_step(2)
+        ceph_health = self.get_ceph_health(ssh, osd_hosts)
+        # FIXME: uncomment the check once PROD-31374 is fixed
+        # status = all(
+        #     ["OK" in status for node, status in ceph_health.items()])
+        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+        # Restart a Ceph OSD node
+        show_step(3)
+        LOG.info("Sending reboot command to '{}' node.".format(osd_host))
+        remote = ssh.remote(node_name=osd_host)
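+        # Reboot the node asynchronously as the SSH session will be dropped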
+        remote.execute_async("/sbin/shutdown -r now")
+
+        # Wait for restarted node to boot and become accessible
+        helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
+        echo_request = "echo"
+        echo_response = salt.local(
+            osd_host, "test.echo", echo_request)['return'][0]
+        assert echo_request == echo_response[osd_host], (
+            "Minion on node '{}' node is not responding after node "
+            "reboot.".format(osd_host)
+        )
+        LOG.info("'{}' node is back after reboot.".format(osd_host))
+
+        # Check Ceph cluster health after node restart
+        show_step(4)
+        ceph_health = self.get_ceph_health(ssh, osd_hosts)  # noqa
+        # FIXME: uncomment the check once PROD-31374 is fixed
+        # status = all(
+        #     ["OK" in status for node, status in ceph_health.items()])
+        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+        # Run Tempest smoke test suite
+        show_step(5)
+        jenkins_creds = salt.get_cluster_jenkins_creds()
+        status, description, stages = self.run_jenkins_job(
+            jenkins_creds,
+            self.TEMPEST_JOB_NAME,
+            self.TEMPEST_JOB_PARAMETERS,
+            self.JENKINS_START_TIMEOUT,
+            self.JENKINS_BUILD_TIMEOUT
+        )
+        assert status == 'SUCCESS', (
+            "'{0}' job run status is {1} after executing Tempest smoke "
+            "tests. Please check the build:\n{2}\n\nExecuted build "
+            "stages:\n{3}".format(
+                self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
+        )
+
+        # Run Sanity test
+        show_step(6)
+        status, description, stages = self.run_jenkins_job(
+            jenkins_creds,
+            self.SANITY_JOB_NAME,
+            self.SANITY_JOB_PARAMETERS,
+            self.JENKINS_START_TIMEOUT,
+            self.JENKINS_BUILD_TIMEOUT
+        )
+        assert status == 'SUCCESS', (
+            "'{0}' job run status is {1} after executing selected sanity "
+            "tests. Please check the build:\n{2}\n\nExecuted build "
+            "stages:\n{3}".format(
+                self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
+        )
+
+    @pytest.mark.grab_versions
+    @pytest.mark.restart_cmn_node
+    def test_restart_cmn_node(
+            self,
+            salt_actions,
+            underlay_actions,
+            show_step):
+        """Verify that Ceph CMN node is not affected by system restart
+
+        Scenario:
+        1. Find Ceph CMN nodes
+        2. Check Ceph cluster health before node restart (skipped until
+            PROD-31374 is fixed)
+        3. Restart 1 Ceph CMN node
+        4. Check Ceph cluster health after node restart (skipped until
+            PROD-31374 is fixed)
+        5. Run Tempest smoke test suite
+        6. Run CVP sanity test suite, excluding test_ceph_health and
+            test_prometheus_alert_count checks
+
+        Duration: ~9 min
+        """
+        salt = salt_actions
+        ssh = underlay_actions
+
+        # Find Ceph CMN nodes
+        show_step(1)
+        tgt = "I@ceph:mon"
+        cmn_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
+        # Select a node for the test
+        cmn_host = cmn_hosts[0]
+
+        # Check Ceph cluster health before node restart
+        show_step(2)
+        ceph_health = self.get_ceph_health(ssh, cmn_hosts)
+        # FIXME: uncomment the check once PROD-31374 is fixed
+        # status = all(
+        #     ["OK" in status for node, status in ceph_health.items()])
+        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+        # Restart a Ceph CMN node
+        show_step(3)
+        LOG.info("Sending reboot command to '{}' node.".format(cmn_host))
+        remote = ssh.remote(node_name=cmn_host)
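+        # Reboot the node asynchronously as the SSH session will be dropped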
+        remote.execute_async("/sbin/shutdown -r now")
+
+        # Wait for restarted node to boot and become accessible
+        helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
+        echo_request = "echo"
+        echo_response = salt.local(
+            cmn_host, "test.echo", echo_request)['return'][0]
+        assert echo_request == echo_response[cmn_host], (
+            "Minion on node '{}' node is not responding after node "
+            "reboot.".format(cmn_host)
+        )
+        LOG.info("'{}' node is back after reboot.".format(cmn_host))
+
+        # Check Ceph cluster health after node restart
+        show_step(4)
+        ceph_health = self.get_ceph_health(ssh, cmn_hosts)  # noqa
+        # FIXME: uncomment the check once PROD-31374 is fixed
+        # status = all(
+        #     ["OK" in status for node, status in ceph_health.items()])
+        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+        # Run Tempest smoke test suite
+        show_step(5)
+        jenkins_creds = salt.get_cluster_jenkins_creds()
+        status, description, stages = self.run_jenkins_job(
+            jenkins_creds,
+            self.TEMPEST_JOB_NAME,
+            self.TEMPEST_JOB_PARAMETERS,
+            self.JENKINS_START_TIMEOUT,
+            self.JENKINS_BUILD_TIMEOUT
+        )
+        assert status == 'SUCCESS', (
+            "'{0}' job run status is {1} after executing Tempest smoke "
+            "tests. Please check the build:\n{2}\n\nExecuted build "
+            "stages:\n{3}".format(
+                self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
+        )
+
+        # Run Sanity test
+        show_step(6)
+        status, description, stages = self.run_jenkins_job(
+            jenkins_creds,
+            self.SANITY_JOB_NAME,
+            self.SANITY_JOB_PARAMETERS,
+            self.JENKINS_START_TIMEOUT,
+            self.JENKINS_BUILD_TIMEOUT
+        )
+        assert status == 'SUCCESS', (
+            "'{0}' job run status is {1} after executing selected sanity "
+            "tests. Please check the build:\n{2}\n\nExecuted build "
+            "stages:\n{3}".format(
+                self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
+        )
+
+    @pytest.mark.grab_versions
+    @pytest.mark.restart_rgw_node
+    def test_restart_rgw_node(
+            self,
+            salt_actions,
+            underlay_actions,
+            show_step):
+        """Verify that Ceph RGW node is not affected by system restart
+
+        Scenario:
+        1. Find Ceph RGW nodes
+        2. Check Ceph cluster health before node restart (skipped until
+            PROD-31374 is fixed)
+        3. Restart 1 Ceph RGW node
+        4. Check Ceph cluster health after node restart (skipped until
+            PROD-31374 is fixed)
+        5. Run Tempest smoke test suite
+        6. Run CVP sanity test suite, excluding test_ceph_health and
+            test_prometheus_alert_count checks
+
+        Duration: ~9 min
+        """
+        salt = salt_actions
+        ssh = underlay_actions
+
+        # Find Ceph RGW nodes
+        show_step(1)
+        tgt = "I@ceph:radosgw"
+        rgw_hosts = list(salt.local(tgt, "test.ping")['return'][0].keys())
+        # Select a node for the test
+        rgw_host = rgw_hosts[0]
+
+        # Check Ceph cluster health before node restart
+        show_step(2)
+        ceph_health = self.get_ceph_health(ssh, rgw_hosts)
+        # FIXME: uncomment the check once PROD-31374 is fixed
+        # status = all(
+        #     ["OK" in status for node, status in ceph_health.items()])
+        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+        # Restart a Ceph RGW node
+        show_step(3)
+        LOG.info("Sending reboot command to '{}' node.".format(rgw_host))
+        remote = ssh.remote(node_name=rgw_host)
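+        # Reboot the node asynchronously as the SSH session will be dropped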
+        remote.execute_async("/sbin/shutdown -r now")
+
+        # Wait for restarted node to boot and become accessible
+        helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
+        echo_request = "echo"
+        echo_response = salt.local(
+            rgw_host, "test.echo", echo_request)['return'][0]
+        assert echo_request == echo_response[rgw_host], (
+            "Minion on node '{}' node is not responding after node "
+            "reboot.".format(rgw_host)
+        )
+        LOG.info("'{}' node is back after reboot.".format(rgw_host))
+
+        # Check Ceph cluster health after node restart
+        show_step(4)
+        ceph_health = self.get_ceph_health(ssh, rgw_hosts)  # noqa
+        # FIXME: uncomment the check once PROD-31374 is fixed
+        # status = all(
+        #     ["OK" in status for node, status in ceph_health.items()])
+        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+
+        # Run Tempest smoke test suite
+        show_step(5)
+        jenkins_creds = salt.get_cluster_jenkins_creds()
+        status, description, stages = self.run_jenkins_job(
+            jenkins_creds,
+            self.TEMPEST_JOB_NAME,
+            self.TEMPEST_JOB_PARAMETERS,
+            self.JENKINS_START_TIMEOUT,
+            self.JENKINS_BUILD_TIMEOUT
+        )
+        assert status == 'SUCCESS', (
+            "'{0}' job run status is {1} after executing Tempest smoke "
+            "tests. Please check the build:\n{2}\n\nExecuted build "
+            "stages:\n{3}".format(
+                self.TEMPEST_JOB_NAME, status, description, '\n'.join(stages))
+        )
+
+        # Run Sanity test
+        show_step(6)
+        status, description, stages = self.run_jenkins_job(
+            jenkins_creds,
+            self.SANITY_JOB_NAME,
+            self.SANITY_JOB_PARAMETERS,
+            self.JENKINS_START_TIMEOUT,
+            self.JENKINS_BUILD_TIMEOUT
+        )
+        assert status == 'SUCCESS', (
+            "'{0}' job run status is {1} after executing selected sanity "
+            "tests. Please check the build:\n{2}\n\nExecuted build "
+            "stages:\n{3}".format(
+                self.SANITY_JOB_NAME, status, description, '\n'.join(stages))
+        )
+
+    # #######################################################################
+    # ############# Tests for fuel-devops deployed environments #############
+    # #######################################################################
     def show_failed_msg(self, failed):
         return "There are failed tempest tests:\n\n  {0}".format(
             '\n\n  '.join([(name + ': ' + detail)
@@ -42,10 +397,10 @@
 
     @pytest.mark.grab_versions
     @pytest.mark.fail_snapshot
-    def test_restart_osd_node(self, func_name, underlay, config,
-                              openstack_deployed, ceph_deployed,
-                              openstack_actions, hardware,
-                              rally, show_step):
+    def _test_restart_osd_node(self, func_name, underlay, config,
+                               openstack_deployed, ceph_deployed,
+                               openstack_actions, hardware,
+                               rally, show_step):
         """Test restart ceph osd node
 
         Scenario:
@@ -108,11 +463,11 @@
 
     @pytest.mark.grab_versions
     @pytest.mark.fail_snapshot
-    def test_restart_cmn_node(self, func_name, underlay, config,
-                              openstack_deployed, ceph_deployed,
-                              core_actions,
-                              salt_actions, openstack_actions,
-                              rally, show_step, hardware):
+    def _test_restart_cmn_node(self, func_name, underlay, config,
+                               openstack_deployed, ceph_deployed,
+                               core_actions,
+                               salt_actions, openstack_actions,
+                               rally, show_step, hardware):
         """Test restart ceph cmn node
 
         Scenario:
@@ -175,11 +530,11 @@
 
     @pytest.mark.grab_versions
     @pytest.mark.fail_snapshot
-    def test_restart_rgw_node(self, func_name, underlay, config,
-                              openstack_deployed, ceph_deployed,
-                              core_actions, hardware,
-                              salt_actions, openstack_actions,
-                              rally, show_step):
+    def _test_restart_rgw_node(self, func_name, underlay, config,
+                               openstack_deployed, ceph_deployed,
+                               core_actions, hardware,
+                               salt_actions, openstack_actions,
+                               rally, show_step):
         """Test restart ceph rgw node
 
         Scenario: