Review tests
Add a salt-minion restart in add_ceph_node tests
Re-enable the Ceph health check in ceph_failover tests
Wait for healthy Ceph after each node restart in ceph_failover tests
  (see the retry sketch below)
Change cvp-sanity and tempest parameters for ceph_failover tests
Add JJB template for Ceph Failover tests
Fix parameters to start SaltMaster backup/restore tests
PROD-36643
Change-Id: I52017158d07373d7cb90846e42edb4276e385552
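
The core of the change in test_failover_ceph.py is a helper that polls "ceph health" on each node until every node reports HEALTH_OK or a timeout expires, replacing the previously commented-out one-shot check. Below is a minimal self-contained sketch of that retry pattern; the run_cmd(node_name, command) callable is a hypothetical stand-in for the framework's SSH helper, used here only for illustration and not part of the change.

    import time

    def wait_healthy_ceph(run_cmd, node_names, time_sec=30, poll_sec=10):
        # run_cmd(node_name, command) is a hypothetical stand-in for the
        # framework's SSH call; it returns the command's stdout as a string.
        ceph_health = {}
        deadline = time.time() + time_sec
        while time.time() < deadline:
            # "ceph health" prints HEALTH_OK / HEALTH_WARN / HEALTH_ERR
            ceph_health = {node: run_cmd(node, "ceph health")
                           for node in node_names}
            if all("HEALTH_OK" in health for health in ceph_health.values()):
                return True, ""
            time.sleep(poll_sec)
        return False, "Ceph health is not OK: {0}".format(ceph_health)

In the tests themselves the helper is given time_sec=120 after a node restart, which goes along with the increased Jenkins build timeout in the diff below.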
diff --git a/tcp_tests/tests/system/test_ceph_operations.py b/tcp_tests/tests/system/test_ceph_operations.py
index b2f98b0..55791ca 100644
--- a/tcp_tests/tests/system/test_ceph_operations.py
+++ b/tcp_tests/tests/system/test_ceph_operations.py
@@ -1,3 +1,5 @@
+import time
+
import pytest
from tcp_tests import logger
@@ -27,10 +29,11 @@
node_name=cfg_node,
raise_on_err=False)
# Need to restart salt-minion service after accepting it in Salt Master
- # underlay_actions.check_call(
- # "systemctl restart salt-minion",
- # node_name=xtra_node,
- # raise_on_err=False)
+ underlay_actions.check_call(
+ "systemctl restart salt-minion",
+ node_name=xtra_node,
+ raise_on_err=False)
+    time.sleep(15)  # give the restarted minion time to reconnect
# salt_actions.enforce_state("xtra*", "linux")
# salt_actions.enforce_state("xtra*", "openssh")
@@ -469,6 +472,7 @@
#OSDSETTINGS
#MONSETTINGS
#RGWSETTINGS
+ #MGRSETTINGS
linux_network_interfaces:
br_ctl:
address: ${_param:ceph_#NODE_node04_address}
@@ -499,6 +503,7 @@
'OSDSETTINGS': '',
'MONSETTINGS': '',
'RGWSETTINGS': '',
+ 'MGRSETTINGS': '',
}
# # ------------------OSD specific settings ----------
@@ -521,6 +526,10 @@
keepalived_vip_priority: 104
""" # noqa: E501
+ # # ------------------MGR specific settings -----------
+ if node == 'mgr':
+ data['MGRSETTINGS'] = ""
+
yaml_config = template.substitute(data)
return yaml_config
diff --git a/tcp_tests/tests/system/test_failover_ceph.py b/tcp_tests/tests/system/test_failover_ceph.py
index a89d711..02d7d28 100644
--- a/tcp_tests/tests/system/test_failover_ceph.py
+++ b/tcp_tests/tests/system/test_failover_ceph.py
@@ -13,6 +13,7 @@
# under the License.
import pytest
+import time
from devops.helpers import helpers
from tcp_tests import logger
@@ -33,13 +34,14 @@
'EXTRA_PARAMS': {
'envs': [
"tests_set=-k "
- "'not test_ceph_health and not test_prometheus_alert_count'"
+ "'not salt_master and not test_ceph_health and not "
+ "test_prometheus_alert_count'"
]
}
}
JENKINS_START_TIMEOUT = 60
- JENKINS_BUILD_TIMEOUT = 60 * 15
+ JENKINS_BUILD_TIMEOUT = 60 * 25
def get_ceph_health(self, ssh, node_names):
"""Get Ceph health status on specified nodes
@@ -51,12 +53,36 @@
"""
return {
node_name: ssh.check_call(
- "ceph -s",
+ "ceph health",
node_name=node_name,
raise_on_err=False)['stdout_str']
for node_name in node_names
}
+ def wait_healthy_ceph(self,
+ ssh,
+ node_names=None,
+ time_sec=30):
+ ceph_health = ""
+ status = False
+
+ start_time = time.time()
+ while time.time() - start_time < time_sec and not status:
+ ceph_health = self.get_ceph_health(ssh, node_names)
+            status = all(["HEALTH_OK"
+                          in health
+                          for node, health
+                          in ceph_health.items()])
+ if status:
+ break
+            LOG.info("Retrying Ceph health check because Ceph is "
+                     "unhealthy: {}".format(ceph_health))
+ time.sleep(10)
+
+ error = "" if status \
+ else "Ceph health is not OK: {0}".format(ceph_health)
+ return status, error
+
@pytest.mark.grab_versions
@pytest.mark.restart_osd_node
def test_restart_osd_node(
@@ -69,11 +95,9 @@
Scenario:
1. Find Ceph OSD nodes
- 2. Check Ceph cluster health before node restart (skipped until
- PROD-31374 is fixed)
+ 2. Check Ceph cluster health before node restart
3. Restart 1 Ceph OSD node
- 4. Check Ceph cluster health after node restart (skipped until
- PROD-31374 is fixed)
+ 4. Check Ceph cluster health after node restart
5. Run Tempest smoke test suite
6. Run test_ceph_status.py::test_ceph_osd and
test_services.py::test_check_services[osd] sanity tests
@@ -93,11 +117,9 @@
# Check Ceph cluster health before node restart
show_step(2)
- ceph_health = self.get_ceph_health(ssh, osd_hosts)
- # FIXME: uncomment the check once PROD-31374 is fixed
- # status = all(
- # ["OK" in status for node, status in ceph_health.items()])
- # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+ result, error = self.wait_healthy_ceph(ssh=ssh,
+ node_names=osd_hosts)
+ assert result, error
# Restart a Ceph OSD node
show_step(3)
@@ -118,11 +140,10 @@
# Check Ceph cluster health after node restart
show_step(4)
- ceph_health = self.get_ceph_health(ssh, osd_hosts) # noqa
- # FIXME: uncomment the check once PROD-31374 is fixed
- # status = all(
- # ["OK" in status for node, status in ceph_health.items()])
- # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+ result, error = self.wait_healthy_ceph(ssh=ssh,
+ node_names=osd_hosts,
+ time_sec=120)
+ assert result, error
# Run Tempest smoke test suite
show_step(5)
@@ -165,11 +186,9 @@
Scenario:
1. Find Ceph CMN nodes
- 2. Check Ceph cluster health before node restart (skipped until
- PROD-31374 is fixed)
+ 2. Check Ceph cluster health before node restart
3. Restart 1 Ceph CMN node
- 4. Check Ceph cluster health after node restart (skipped until
- PROD-31374 is fixed)
+ 4. Check Ceph cluster health after node restart
5. Run Tempest smoke test suite
6. Run test_ceph_status.py::test_ceph_replicas and
test_services.py::test_check_services[cmn] sanity tests
@@ -189,11 +208,9 @@
# Check Ceph cluster health before node restart
show_step(2)
- ceph_health = self.get_ceph_health(ssh, cmn_hosts)
- # FIXME: uncomment the check once PROD-31374 is fixed
- # status = all(
- # ["OK" in status for node, status in ceph_health.items()])
- # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+ result, error = self.wait_healthy_ceph(ssh=ssh,
+ node_names=cmn_hosts)
+ assert result, error
# Restart a Ceph CMN node
show_step(3)
@@ -214,11 +231,10 @@
# Check Ceph cluster health after node restart
show_step(4)
- ceph_health = self.get_ceph_health(ssh, cmn_hosts) # noqa
- # FIXME: uncomment the check once PROD-31374 is fixed
- # status = all(
- # ["OK" in status for node, status in ceph_health.items()])
- # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+ result, error = self.wait_healthy_ceph(ssh=ssh,
+ node_names=cmn_hosts,
+ time_sec=120)
+ assert result, error
# Run Tempest smoke test suite
show_step(5)
@@ -261,11 +277,9 @@
Scenario:
1. Find Ceph RGW nodes
- 2. Check Ceph cluster health before node restart (skipped until
- PROD-31374 is fixed)
+ 2. Check Ceph cluster health before node restart
3. Restart 1 Ceph RGW node
- 4. Check Ceph cluster health after node restart (skipped until
- PROD-31374 is fixed)
+ 4. Check Ceph cluster health after node restart
5. Run Tempest smoke test suite
6. Run test_services.py::test_check_services[rgw] sanity test
@@ -284,11 +298,9 @@
# Check Ceph cluster health before node restart
show_step(2)
- ceph_health = self.get_ceph_health(ssh, rgw_hosts)
- # FIXME: uncomment the check once PROD-31374 is fixed
- # status = all(
- # ["OK" in status for node, status in ceph_health.items()])
- # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+ result, error = self.wait_healthy_ceph(ssh=ssh,
+ node_names=rgw_hosts)
+ assert result, error
# Restart a Ceph RGW node
show_step(3)
@@ -309,11 +321,11 @@
# Check Ceph cluster health after node restart
show_step(4)
- ceph_health = self.get_ceph_health(ssh, rgw_hosts) # noqa
- # FIXME: uncomment the check once PROD-31374 is fixed
- # status = all(
- # ["OK" in status for node, status in ceph_health.items()])
- # assert status, "Ceph health is not OK: {0}".format(ceph_health)
+ result, error = self.wait_healthy_ceph(ssh=ssh,
+ node_names=rgw_hosts,
+ time_sec=120)
+
+ assert result, error
# Run Tempest smoke test suite
show_step(5)
@@ -384,9 +396,9 @@
# STEP #2
show_step(2)
# Get the ceph health output before restart
- health_before = self.get_ceph_health(underlay, osd_node_names)
- assert all(["OK" in p for n, p in health_before.items()]), (
- "'Ceph health is not ok from node: {0}".format(health_before))
+ result, error = self.wait_healthy_ceph(ssh=underlay,
+ node_names=osd_node_names)
+ assert result, error
# STEP #3
show_step(3)
@@ -399,9 +411,10 @@
# STEP #4
show_step(4)
# Get the ceph health output after restart
- health_after = self.get_ceph_health(underlay, osd_node_names)
- assert all(["OK" in p for n, p in health_before.items()]), (
- "'Ceph health is not ok from node: {0}".format(health_after))
+ result, error = self.wait_healthy_ceph(ssh=underlay,
+ node_names=osd_node_names)
+
+ assert result, error
rally.run_container()
@@ -451,9 +464,10 @@
# STEP #2
show_step(2)
# Get the ceph health output before restart
- health_before = self.get_ceph_health(underlay, cmn_node_names)
- assert all(["OK" in p for n, p in health_before.items()]), (
- "'Ceph health is not ok from node: {0}".format(health_before))
+ result, error = self.wait_healthy_ceph(ssh=underlay,
+ node_names=cmn_node_names)
+
+ assert result, error
# STEP #3
show_step(3)
@@ -466,9 +480,11 @@
# STEP #4
show_step(4)
# Get the ceph health output after restart
- health_after = self.get_ceph_health(underlay, cmn_node_names)
- assert all(["OK" in p for n, p in health_before.items()]), (
- "'Ceph health is not ok from node: {0}".format(health_after))
+ result, error = self.wait_healthy_ceph(ssh=underlay,
+ node_names=cmn_node_names,
+ time_sec=120)
+
+ assert result, error
rally.run_container()
@@ -521,9 +537,9 @@
# STEP #2
show_step(2)
# Get the ceph health output before restart
- health_before = self.get_ceph_health(underlay, rgw_node_names)
- assert all(["OK" in p for n, p in health_before.items()]), (
- "'Ceph health is not ok from node: {0}".format(health_before))
+ result, error = self.wait_healthy_ceph(ssh=underlay,
+ node_names=rgw_node_names)
+ assert result, error
# STEP #3
show_step(3)
@@ -536,9 +552,10 @@
# STEP #4
show_step(4)
# Get the ceph health output after restart
- health_after = self.get_ceph_health(underlay, rgw_node_names)
- assert all(["OK" in p for n, p in health_before.items()]), (
- "'Ceph health is not ok from node: {0}".format(health_after))
+ result, error = self.wait_healthy_ceph(ssh=underlay,
+ node_names=rgw_node_names,
+ time_sec=120)
+ assert result, error
rally.run_container()