Review tests

Add salt-minion restart in add_ceph_node tests
Restore the Ceph health check in ceph_failover tests
Wait for healthy Ceph after each node restart in ceph_failover tests
Change cvp-sanity and tempest parameters for ceph_failover tests
Add JJB template for Ceph Failover tests
Fix parameters to start SaltMaster backup/restore tests

PROD-36643

Change-Id: I52017158d07373d7cb90846e42edb4276e385552
diff --git a/tcp_tests/tests/system/test_ceph_operations.py b/tcp_tests/tests/system/test_ceph_operations.py
index b2f98b0..55791ca 100644
--- a/tcp_tests/tests/system/test_ceph_operations.py
+++ b/tcp_tests/tests/system/test_ceph_operations.py
@@ -1,3 +1,5 @@
+import time
+
 import pytest
 
 from tcp_tests import logger
@@ -27,10 +29,11 @@
             node_name=cfg_node,
             raise_on_err=False)
     # Need to restart salt-minion service after accepting it in Salt Master
-    # underlay_actions.check_call(
-    #     "systemctl restart salt-minion",
-    #     node_name=xtra_node,
-    #     raise_on_err=False)
+    underlay_actions.check_call(
+        "systemctl restart salt-minion",
+        node_name=xtra_node,
+        raise_on_err=False)
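+    # Give the restarted minion time to re-register with the Salt Master;
+    # 15s is an empirical delay, there is no exact event to poll here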
+    time.sleep(15)
     # salt_actions.enforce_state("xtra*", "linux")
     # salt_actions.enforce_state("xtra*", "openssh")
 
@@ -469,6 +472,7 @@
                 #OSDSETTINGS
                 #MONSETTINGS
                 #RGWSETTINGS
+                #MGRSETTINGS
                 linux_network_interfaces:
                   br_ctl:
                     address: ${_param:ceph_#NODE_node04_address}
@@ -499,6 +503,7 @@
         'OSDSETTINGS': '',
         'MONSETTINGS': '',
         'RGWSETTINGS': '',
+        'MGRSETTINGS': '',
 
     }
     # # ------------------OSD specific settings ----------
@@ -521,6 +526,10 @@
                 keepalived_vip_priority: 104
                 """  # noqa: E501
 
+    # # ------------------MGR specific settings -----------
+    if node == 'mgr':
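+        # No MGR-specific pillar data is needed yet; this branch is a
+        # placeholder mirroring the OSD/MON/RGW sections above (the key
+        # already defaults to an empty string)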
+        data['MGRSETTINGS'] = ""
+
     yaml_config = template.substitute(data)
 
     return yaml_config
diff --git a/tcp_tests/tests/system/test_failover_ceph.py b/tcp_tests/tests/system/test_failover_ceph.py
index a89d711..02d7d28 100644
--- a/tcp_tests/tests/system/test_failover_ceph.py
+++ b/tcp_tests/tests/system/test_failover_ceph.py
@@ -13,6 +13,8 @@
 #    under the License.
+import time
+
 import pytest
 
 from devops.helpers import helpers
 from tcp_tests import logger
 
@@ -33,13 +34,14 @@
         'EXTRA_PARAMS': {
             'envs': [
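+                # 'salt_master' sanity cases are excluded below: SaltMaster
+                # backup/restore is covered by its own job (see commit message)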
                 "tests_set=-k "
-                "'not test_ceph_health and not test_prometheus_alert_count'"
+                "'not salt_master and not test_ceph_health and not "
+                "test_prometheus_alert_count'"
             ]
         }
     }
 
     JENKINS_START_TIMEOUT = 60
-    JENKINS_BUILD_TIMEOUT = 60 * 15
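+    # Failover builds now poll for HEALTH_OK between restarts, so give the
+    # Jenkins job more head-room (25 minutes is an empirical upper bound)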
+    JENKINS_BUILD_TIMEOUT = 60 * 25
 
     def get_ceph_health(self, ssh, node_names):
         """Get Ceph health status on specified nodes
@@ -51,12 +53,36 @@
         """
         return {
             node_name: ssh.check_call(
-                "ceph -s",
+                "ceph health",
                 node_name=node_name,
                 raise_on_err=False)['stdout_str']
             for node_name in node_names
         }
 
+    def wait_healthy_ceph(self,
+                          ssh,
+                          node_names=None,
+                          time_sec=30):
+        """Wait until Ceph reports HEALTH_OK on all specified nodes
+
+        :param ssh: SSH manager to run 'ceph health' with
+        :param node_names: list of node names to check
+        :param time_sec: timeout (in seconds) for the polling loop
+        :return: (bool, str) success flag and error message ('' on success)
+        """
+        ceph_health = ""
+        status = False
+
+        start_time = time.time()
+        while time.time() - start_time < time_sec:
+            ceph_health = self.get_ceph_health(ssh, node_names)
+            status = all("HEALTH_OK" in health
+                         for health in ceph_health.values())
+            if status:
+                break
+            LOG.info("Ceph is not healthy yet, retrying: {}"
+                     .format(ceph_health))
+            time.sleep(10)
+
+        error = "" if status \
+            else "Ceph health is not OK: {0}".format(ceph_health)
+        return status, error
+
     @pytest.mark.grab_versions
     @pytest.mark.restart_osd_node
     def test_restart_osd_node(
@@ -69,11 +95,9 @@
 
         Scenario:
         1. Find Ceph OSD nodes
-        2. Check Ceph cluster health before node restart (skipped until
-            PROD-31374 is fixed)
+        2. Check Ceph cluster health before node restart
         3. Restart 1 Ceph OSD node
-        4. Check Ceph cluster health after node restart (skipped until
-            PROD-31374 is fixed)
+        4. Check Ceph cluster health after node restart
         5. Run Tempest smoke test suite
         6. Run test_ceph_status.py::test_ceph_osd and
             test_services.py::test_check_services[osd] sanity tests
@@ -93,11 +117,9 @@
 
         # Check Ceph cluster health before node restart
         show_step(2)
-        ceph_health = self.get_ceph_health(ssh, osd_hosts)
-        # FIXME: uncomment the check once PROD-31374 is fixed
-        # status = all(
-        #     ["OK" in status for node, status in ceph_health.items()])
-        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+        result, error = self.wait_healthy_ceph(ssh=ssh,
+                                               node_names=osd_hosts)
+        assert result, error
 
         # Restart a Ceph OSD node
         show_step(3)
@@ -118,11 +140,10 @@
 
         # Check Ceph cluster health after node restart
         show_step(4)
-        ceph_health = self.get_ceph_health(ssh, osd_hosts)  # noqa
-        # FIXME: uncomment the check once PROD-31374 is fixed
-        # status = all(
-        #     ["OK" in status for node, status in ceph_health.items()])
-        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+        result, error = self.wait_healthy_ceph(ssh=ssh,
+                                               node_names=osd_hosts,
+                                               time_sec=120)
+        assert result, error
 
         # Run Tempest smoke test suite
         show_step(5)
@@ -165,11 +186,9 @@
 
         Scenario:
         1. Find Ceph CMN nodes
-        2. Check Ceph cluster health before node restart (skipped until
-            PROD-31374 is fixed)
+        2. Check Ceph cluster health before node restart
         3. Restart 1 Ceph CMN node
-        4. Check Ceph cluster health after node restart (skipped until
-            PROD-31374 is fixed)
+        4. Check Ceph cluster health after node restart
         5. Run Tempest smoke test suite
         6. Run test_ceph_status.py::test_ceph_replicas and
             test_services.py::test_check_services[cmn] sanity tests
@@ -189,11 +208,9 @@
 
         # Check Ceph cluster health before node restart
         show_step(2)
-        ceph_health = self.get_ceph_health(ssh, cmn_hosts)
-        # FIXME: uncomment the check once PROD-31374 is fixed
-        # status = all(
-        #     ["OK" in status for node, status in ceph_health.items()])
-        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+        result, error = self.wait_healthy_ceph(ssh=ssh,
+                                               node_names=cmn_hosts)
+        assert result, error
 
         # Restart a Ceph CMN node
         show_step(3)
@@ -214,11 +231,10 @@
 
         # Check Ceph cluster health after node restart
         show_step(4)
-        ceph_health = self.get_ceph_health(ssh, cmn_hosts) # noqa
-        # FIXME: uncomment the check once PROD-31374 is fixed
-        # status = all(
-        #     ["OK" in status for node, status in ceph_health.items()])
-        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+        result, error = self.wait_healthy_ceph(ssh=ssh,
+                                               node_names=cmn_hosts,
+                                               time_sec=120)
+        assert result, error
 
         # Run Tempest smoke test suite
         show_step(5)
@@ -261,11 +277,9 @@
 
         Scenario:
         1. Find Ceph RGW nodes
-        2. Check Ceph cluster health before node restart (skipped until
-            PROD-31374 is fixed)
+        2. Check Ceph cluster health before node restart
         3. Restart 1 Ceph RGW node
-        4. Check Ceph cluster health after node restart (skipped until
-            PROD-31374 is fixed)
+        4. Check Ceph cluster health after node restart
         5. Run Tempest smoke test suite
         6. Run test_services.py::test_check_services[rgw] sanity test
 
@@ -284,11 +298,9 @@
 
         # Check Ceph cluster health before node restart
         show_step(2)
-        ceph_health = self.get_ceph_health(ssh, rgw_hosts)
-        # FIXME: uncomment the check once PROD-31374 is fixed
-        # status = all(
-        #     ["OK" in status for node, status in ceph_health.items()])
-        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+        result, error = self.wait_healthy_ceph(ssh=ssh,
+                                               node_names=rgw_hosts)
+        assert result, error
 
         # Restart a Ceph RGW node
         show_step(3)
@@ -309,11 +321,11 @@
 
         # Check Ceph cluster health after node restart
         show_step(4)
-        ceph_health = self.get_ceph_health(ssh, rgw_hosts) # noqa
-        # FIXME: uncomment the check once PROD-31374 is fixed
-        # status = all(
-        #     ["OK" in status for node, status in ceph_health.items()])
-        # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+        result, error = self.wait_healthy_ceph(ssh=ssh,
+                                               node_names=rgw_hosts,
+                                               time_sec=120)
+        assert result, error
 
         # Run Tempest smoke test suite
         show_step(5)
@@ -384,9 +396,9 @@
         # STEP #2
         show_step(2)
         # Get the ceph health output before restart
-        health_before = self.get_ceph_health(underlay, osd_node_names)
-        assert all(["OK" in p for n, p in health_before.items()]), (
-            "'Ceph health is not ok from node: {0}".format(health_before))
+        result, error = self.wait_healthy_ceph(ssh=underlay,
+                                               node_names=osd_node_names)
+        assert result, error
 
         # STEP #3
         show_step(3)
@@ -399,9 +411,10 @@
         # STEP #4
         show_step(4)
         # Get the ceph health output after restart
-        health_after = self.get_ceph_health(underlay, osd_node_names)
-        assert all(["OK" in p for n, p in health_before.items()]), (
-            "'Ceph health is not ok from node: {0}".format(health_after))
+        result, error = self.wait_healthy_ceph(ssh=underlay,
+                                               node_names=osd_node_names)
+        assert result, error
 
         rally.run_container()
 
@@ -451,9 +464,10 @@
         # STEP #2
         show_step(2)
         # Get the ceph health output before restart
-        health_before = self.get_ceph_health(underlay, cmn_node_names)
-        assert all(["OK" in p for n, p in health_before.items()]), (
-            "'Ceph health is not ok from node: {0}".format(health_before))
+        result, error = self.wait_healthy_ceph(ssh=underlay,
+                                               node_names=cmn_node_names)
+        assert result, error
 
         # STEP #3
         show_step(3)
@@ -466,9 +480,11 @@
         # STEP #4
         show_step(4)
         # Get the ceph health output after restart
-        health_after = self.get_ceph_health(underlay, cmn_node_names)
-        assert all(["OK" in p for n, p in health_before.items()]), (
-            "'Ceph health is not ok from node: {0}".format(health_after))
+        result, error = self.wait_healthy_ceph(ssh=underlay,
+                                               node_names=cmn_node_names,
+                                               time_sec=120)
+        assert result, error
 
         rally.run_container()
 
@@ -521,9 +537,9 @@
         # STEP #2
         show_step(2)
         # Get the ceph health output before restart
-        health_before = self.get_ceph_health(underlay, rgw_node_names)
-        assert all(["OK" in p for n, p in health_before.items()]), (
-            "'Ceph health is not ok from node: {0}".format(health_before))
+        result, error = self.wait_healthy_ceph(ssh=underlay,
+                                               node_names=rgw_node_names)
+        assert result, error
 
         # STEP #3
         show_step(3)
@@ -536,9 +552,10 @@
         # STEP #4
         show_step(4)
         # Get the ceph health output after restart
-        health_after = self.get_ceph_health(underlay, rgw_node_names)
-        assert all(["OK" in p for n, p in health_before.items()]), (
-            "'Ceph health is not ok from node: {0}".format(health_after))
+        result, error = self.wait_healthy_ceph(ssh=underlay,
+                                               node_names=rgw_node_names,
+                                               time_sec=120)
+        assert result, error
 
         rally.run_container()