Add HA cases for rabbitmq and galera
- kill rabbitmq_server on random node
- kill mysql server on node with maximum mysql connections
Change-Id: I8cf1aab4a4213c339ddefcecaa86d9a80d52acdb
diff --git a/tcp_tests/tests/system/conftest.py b/tcp_tests/tests/system/conftest.py
index ec3846d..64288ab 100644
--- a/tcp_tests/tests/system/conftest.py
+++ b/tcp_tests/tests/system/conftest.py
@@ -52,6 +52,7 @@
# oss_fixtures
'oss_actions',
'oss_deployed',
+ 'oss_sl_os_deployed',
# decapod_fixtures
'decapod_actions',
'decapod_deployed',
diff --git a/tcp_tests/tests/system/test_failover_openstack_services.py b/tcp_tests/tests/system/test_failover_openstack_services.py
index 16b4a8c..37cff72 100644
--- a/tcp_tests/tests/system/test_failover_openstack_services.py
+++ b/tcp_tests/tests/system/test_failover_openstack_services.py
@@ -95,7 +95,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_restart_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test restart keepalived on ctl* nodes
@@ -163,7 +164,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_stop_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test stop keepalived on ctl node with VIP under load
@@ -249,7 +251,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_kill_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test kill keepalived and haproxy on ctl node with VIP under load
@@ -341,6 +344,8 @@
# 5. Check that SL sent a e-mail notification about the failed
# keepalived service, and then remove the VIP remaining
# on the node after killing keepalived.
+ # Alternative: check prometheus alerts list on mon*:
+ # curl http://localhost:15011/api/v1/alerts
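+        # A minimal illustrative sketch of that alternative check, assuming
+        # the alertmanager API answers on port 15011 on the mon* nodes as
+        # the comment above says; it only logs the alerts:
+        mon_alerts = underlay.check_call(
+            "salt 'mon*' cmd.run "
+            "'curl -s http://localhost:15011/api/v1/alerts'",
+            host=config.salt.salt_master_host)['stdout_str']
+        LOG.info("Prometheus alerts on mon nodes:\n{0}".format(mon_alerts))
+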
# Remove the VIP address manually because
# the killed keepalived cannot do it
@@ -399,12 +404,13 @@
for node_name, ps in ps_before.items():
if node_name == new_minion_vip:
- # Check that keepalived actually stopped on <minion_vip> node
+ # Check that haproxy has been actually restarted
+ # on <new_minion_vip> node
assert ps_after[node_name] and (ps != ps_after[node_name]), (
"Haproxy wasn't restarted on node {0}: {1}"
.format(node_name, ps_after[node_name]))
else:
- # Check that keepalived on other ctl nodes was not restarted
+ # Check that haproxy on other ctl nodes was not restarted
assert ps == ps_after[node_name], (
"Haproxy was restarted while it shouldn't on node {0}"
.format(node_name))
@@ -419,3 +425,178 @@
assert not results['fail'], self.show_failed_msg(results['fail'])
LOG.info("*************** DONE **************")
+
+ @pytest.mark.grab_versions
+ @pytest.mark.fail_snapshot
+ @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
+ def test_kill_rabbit_galera(self, func_name, underlay, config,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
+ salt_actions, openstack_actions,
+ rally, show_step):
+        """Test kill rabbitmq and galera services on ctl nodes under load
+
+ Scenario:
+ 1. Find controller minion id with VIP
+            2. Set rabbitmq_server to be killed on a random ctl node
+               in a few minutes, TR case #3385677
+            3. Run rally task to generate load
+            4. Check that rabbitmq_server was killed on that ctl node
+            5. Find the controller minion id with the Galera backend
+               which is receiving the connections
+            6. Set mysql server to be killed on it in a few minutes,
+               TR case #4753976
+ 7. Run rally task to generate load
+ 8. Check that mysql was killed and started again by systemd
+ 9. Check galera cluster status and replication
+ 10. Run tempest smoke after failover
+ 11. Check tempest report for failed tests
+
+        Requirements:
+ - Salt cluster
+ - OpenStack cluster
+ """
+ common_services_actions.check_keepalived_pillar()
+ salt = salt_actions
+
+ ctl_node_names = underlay.get_target_node_names(
+ target='ctl')
+
+ # Rabbitmq case
+ # STEP #1
+ show_step(1)
+ # Get the ps output with datetime of the process
+ ps_before = self.get_ps_time(
+ underlay, "rabbitmq_server", ctl_node_names)
+ assert all(["rabbitmq_server" in p for n, p in ps_before.items()]), (
+ "'rabbitmq_server' is not running on some nodes: {0}"
+ .format(ps_before))
+
+ ctl_vip_pillar = salt.get_pillar(
+ tgt="I@nova:controller:enabled:True",
+ pillar="_param:cluster_vip_address")[0]
+ vip = [vip for minion_id, vip in ctl_vip_pillar.items()][0]
+ ctl_minions = ctl_vip_pillar.keys()
+ minion_vip = common_services_actions.get_keepalived_vip_minion_id(vip)
+ LOG.info("VIP {0} is on {1}".format(vip, minion_vip))
+
+ # STEP #2
+ show_step(2)
+
+ ctl_minion = underlay.get_random_node(ctl_minions)
+ ctl_node_name = salt_actions.get_grains(
+ tgt=ctl_minion, grains='fqdn')[0][ctl_minion]
+ LOG.info("Scheduling to kill rabbitmq on the minion {0}"
+ .format(ctl_minion))
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'killall -9 -u rabbitmq'".format(ctl_minion),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
+
+ LOG.info("'at -l':\n" + underlay.check_call(
+ "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+ # STEP #3
+ show_step(3)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=4, timeout=900)
+
+ # STEP #4
+ show_step(4)
+ ps_after = self.get_ps_time(underlay,
+ "rabbitmq_server",
+ ctl_node_names)
+
+ for node_name, ps in ps_before.items():
+ if node_name == ctl_node_name:
+                # Check that rabbitmq_server has been actually stopped
+                # on the <ctl_node_name> node
+                assert not ps_after[node_name], (
+                    "'rabbitmq_server' was not stopped on node {0}"
+                    .format(node_name))
+ else:
+ # Check that rabbitmq_server on other ctl nodes
+ # was not restarted
+ assert ps == ps_after[node_name], (
+ "'rabbitmq_server' was restarted while it shouldn't!")
+
+ # Mysql case
+ # STEP #5
+ show_step(5)
+ # At first, ensure that mysql is running on all controllers
+ ps_before = self.get_ps_time(
+ underlay, "mysqld", ctl_node_names)
+ assert all(["mysqld" in p for n, p in ps_before.items()]), (
+ "'mysqld' is not running on some nodes: {0}"
+ .format(ps_before))
+
+ # Check haproxy status on the node with VIP and find the mysql backend
+ # which is receiving the connections
+        haproxy_status = common_services_actions.get_haproxy_status(
+            minion_vip)
+ mysql_status = haproxy_status['mysql_cluster']
+ mysql_tgt = ''
+ scur = 0
+ for svname in mysql_status.keys():
+ if svname == "FRONTEND" or svname == "BACKEND":
+ continue
+ snew = int(mysql_status[svname]['scur'])
+ if scur < snew:
+ scur = snew
+ mysql_tgt = svname + '*'
+        assert scur > 0, ("No sessions to 'mysql_cluster' haproxy backend on "
+                          "the node with VIP, something is wrong with the "
+                          "cluster.")
+
+ # STEP #6
+ show_step(6)
+        LOG.info("Scheduling to kill mysqld on the minion {0}"
+                 .format(mysql_tgt))
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'killall -9 -u mysql'".format(mysql_tgt),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
+
+ LOG.info("'at -l':\n" + underlay.check_call(
+ "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+ # STEP #7
+ show_step(7)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=4, timeout=900)
+
+ # STEP #8
+ show_step(8)
+ ret = salt.service_status("I@nova:controller:enabled:True",
+ "mysql")
+ LOG.info(ret)
+ ps_after = self.get_ps_time(underlay, "mysqld", ctl_node_names)
+
+        # mysqld was killed on the node that matches the 'mysql_tgt'
+        # backend name (the trailing '*' is a salt glob, not part of it)
+        killed_node = mysql_tgt.rstrip('*')
+        for node_name, ps in ps_before.items():
+            if node_name.startswith(killed_node):
+                # Check that mysql has been actually restarted
+                # on the node where it was killed
+                assert ps_after[node_name] and (ps != ps_after[node_name]), (
+                    "Mysql wasn't restarted on node {0}: {1}"
+                    .format(node_name, ps_after[node_name]))
+            else:
+                # Check that mysql on other ctl nodes was not restarted
+                assert ps == ps_after[node_name], (
+                    "Mysql was restarted while it shouldn't on node {0}"
+                    .format(node_name))
+
+ # STEP #9
+ show_step(9)
+ # TODO(ddmitriev): check galera cluster status and replication
+ # like it was checked in OSTF.
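+        # A minimal illustrative sketch of such a check, assuming the local
+        # 'mysql' CLI on the ctl nodes can read the wsrep status without
+        # extra credentials; it only logs the reported cluster size and is
+        # not a substitute for the OSTF-like checks mentioned above:
+        wsrep_status = underlay.check_call(
+            "salt -C 'I@nova:controller:enabled:True' cmd.run "
+            "\"mysql -e \\\"SHOW STATUS LIKE 'wsrep_cluster_size'\\\"\"",
+            host=config.salt.salt_master_host)['stdout_str']
+        LOG.info("Galera wsrep_cluster_size:\n{0}".format(wsrep_status))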
+
+ # STEP #10
+ show_step(10)
+ results = rally.run_tempest(pattern='set=smoke',
+ report_prefix=func_name,
+ timeout=1800)
+        # STEP #11
+ show_step(11)
+ assert not results['fail'], self.show_failed_msg(results['fail'])
+
+ LOG.info("*************** DONE **************")