Merge "Add HA cases for rabbitmq and galera"
diff --git a/tcp_tests/fixtures/oss_fixtures.py b/tcp_tests/fixtures/oss_fixtures.py
index a74313b..95bbc54 100644
--- a/tcp_tests/fixtures/oss_fixtures.py
+++ b/tcp_tests/fixtures/oss_fixtures.py
@@ -79,3 +79,18 @@
pass
return oss_actions
+
+
+@pytest.mark.revert_snapshot(ext.SNAPSHOT.oss_deployed)
+@pytest.fixture(scope='function')
+def oss_sl_os_deployed(revert_snapshot,
+ sl_os_deployed,
+ oss_deployed):
+ """Fixture to get or install SL and OpenStack services on environment
+
+ Uses fixtures openstack_deployed and sl_deployed, with 'sl_deployed'
+ top-level snapshot.
+
+ Returns SLManager instance object
+ """
+ return oss_deployed
diff --git a/tcp_tests/managers/common_services_manager.py b/tcp_tests/managers/common_services_manager.py
index 1e783a8..c62114d 100644
--- a/tcp_tests/managers/common_services_manager.py
+++ b/tcp_tests/managers/common_services_manager.py
@@ -198,3 +198,31 @@
LOG.debug("keepalived pillars check passed: {0}".format(vips))
return vips
+
+ def get_haproxy_status(self, tgt):
+ """Get haproxy status for all backends on a specified minion"""
+ cmd = ("echo 'show stat' | "
+ "socat 'UNIX-CONNECT:/run/haproxy/admin.sock' STDIO")
+ # Run 'show stat' on the haproxy admin socket of the target minion
+ res = self._salt.run_state(tgt, 'cmd.run', cmd)
+ output = res[0]['return'][0]
+ assert len(output) == 1, "Please specify a single minion in tgt"
+ minion_id = list(output.keys())[0]
+
+ haproxy_status = {}
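+ # 'show stat' returns CSV: a header line starting with '#' followed
+ # by one line per proxy/server pair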
+ for line in output[minion_id].splitlines():
+ if line.startswith("#"):
+ continue
+ status = line.split(",")
+ pxname = status[0]
+ svname = status[1]
+ if pxname not in haproxy_status:
+ haproxy_status[pxname] = {}
+ haproxy_status[pxname][svname] = {
+ 'scur': status[4], # sessions current
+ 'smax': status[5], # sessions max
+ 'status': status[17], # status: UP or DOWN
+ 'rate': status[33], # sessions rate
+ }
+ LOG.debug("Haproxy status: \n{0}".format(haproxy_status))
+ return haproxy_status
diff --git a/tcp_tests/managers/underlay_ssh_manager.py b/tcp_tests/managers/underlay_ssh_manager.py
index 2008fba..7d3da96 100644
--- a/tcp_tests/managers/underlay_ssh_manager.py
+++ b/tcp_tests/managers/underlay_ssh_manager.py
@@ -363,12 +363,13 @@
with self.remote(node_name=host) as remote:
remote.upload(source, destination)
- def get_random_node(self):
+ def get_random_node(self, node_names=None):
"""Get random node name
+ :param node_names: optional list of node names to choose from
+ (all nodes by default)
:return: str, name of node
"""
- return random.choice(self.node_names())
+ return random.choice(node_names or self.node_names())
def yaml_editor(self, file_path, node_name=None, host=None,
address_pool=None):
diff --git a/tcp_tests/tests/system/conftest.py b/tcp_tests/tests/system/conftest.py
index ec3846d..64288ab 100644
--- a/tcp_tests/tests/system/conftest.py
+++ b/tcp_tests/tests/system/conftest.py
@@ -52,6 +52,7 @@
# oss_fixtures
'oss_actions',
'oss_deployed',
+ 'oss_sl_os_deployed',
# decapod_fixtures
'decapod_actions',
'decapod_deployed',
diff --git a/tcp_tests/tests/system/test_failover_openstack_services.py b/tcp_tests/tests/system/test_failover_openstack_services.py
index 16b4a8c..37cff72 100644
--- a/tcp_tests/tests/system/test_failover_openstack_services.py
+++ b/tcp_tests/tests/system/test_failover_openstack_services.py
@@ -95,7 +95,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_restart_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test restart keepalived on ctl* nodes
@@ -163,7 +164,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_stop_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test stop keepalived on ctl node with VIP under load
@@ -249,7 +251,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_kill_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test kill keepalived and haproxy on ctl node with VIP under load
@@ -341,6 +344,8 @@
# 5. Check that SL sent an e-mail notification about the failed
# keepalived service, and then remove the VIP remaining
# on the node after killing keepalived.
+ # Alternative: check prometheus alerts list on mon*:
+ # curl http://localhost:15011/api/v1/alerts
# Remove the VIP address manually because
# the killed keepalived cannot do it
@@ -399,12 +404,13 @@
for node_name, ps in ps_before.items():
if node_name == new_minion_vip:
- # Check that keepalived actually stopped on <minion_vip> node
+ # Check that haproxy has actually been restarted
+ # on <new_minion_vip> node
assert ps_after[node_name] and (ps != ps_after[node_name]), (
"Haproxy wasn't restarted on node {0}: {1}"
.format(node_name, ps_after[node_name]))
else:
- # Check that keepalived on other ctl nodes was not restarted
+ # Check that haproxy on other ctl nodes was not restarted
assert ps == ps_after[node_name], (
"Haproxy was restarted while it shouldn't on node {0}"
.format(node_name))
@@ -419,3 +425,178 @@
assert not results['fail'], self.show_failed_msg(results['fail'])
LOG.info("*************** DONE **************")
+
+ @pytest.mark.grab_versions
+ @pytest.mark.fail_snapshot
+ @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
+ def test_kill_rabbit_galera(self, func_name, underlay, config,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
+ salt_actions, openstack_actions,
+ rally, show_step):
+ """Test kill rabbitmq and galera on ctl node with VIP under load
+
+ Scenario:
+ 1. Find the controller minion id holding the VIP
+ 2. Set rabbitmq_server to be killed on a random ctl node
+ in a few minutes, TR case #3385677
+ 3. Run rally task to generate load
+ 4. Check that rabbitmq_server was killed on the chosen ctl node
+ 5. Find the controller minion with the Galera backend that is
+ receiving connections
+ 6. Set mysql server to be killed in a few minutes, TR case #4753976
+ 7. Run rally task to generate load
+ 8. Check that mysql was killed and started again by systemd
+ 9. Check galera cluster status and replication
+ 10. Run tempest smoke after failover
+ 11. Check tempest report for failed tests
+
+ Requirements:
+ - Salt cluster
+ - OpenStack cluster
+ """
+ common_services_actions.check_keepalived_pillar()
+ salt = salt_actions
+
+ ctl_node_names = underlay.get_target_node_names(
+ target='ctl')
+
+ # Rabbitmq case
+ # STEP #1
+ show_step(1)
+ # Get the ps output with datetime of the process
+ ps_before = self.get_ps_time(
+ underlay, "rabbitmq_server", ctl_node_names)
+ assert all(["rabbitmq_server" in p for n, p in ps_before.items()]), (
+ "'rabbitmq_server' is not running on some nodes: {0}"
+ .format(ps_before))
+
+ ctl_vip_pillar = salt.get_pillar(
+ tgt="I@nova:controller:enabled:True",
+ pillar="_param:cluster_vip_address")[0]
+ vip = [vip for minion_id, vip in ctl_vip_pillar.items()][0]
+ ctl_minions = list(ctl_vip_pillar.keys())
+ minion_vip = common_services_actions.get_keepalived_vip_minion_id(vip)
+ LOG.info("VIP {0} is on {1}".format(vip, minion_vip))
+
+ # STEP #2
+ show_step(2)
+
+ ctl_minion = underlay.get_random_node(ctl_minions)
+ ctl_node_name = salt_actions.get_grains(
+ tgt=ctl_minion, grains='fqdn')[0][ctl_minion]
+ LOG.info("Scheduling to kill rabbitmq on the minion {0}"
+ .format(ctl_minion))
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'killall -9 -u rabbitmq'".format(ctl_minion),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
+
+ LOG.info("'at -l':\n" + underlay.check_call(
+ "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+ # STEP #3
+ show_step(3)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=4, timeout=900)
+
+ # STEP #4
+ show_step(4)
+ ps_after = self.get_ps_time(underlay,
+ "rabbitmq_server",
+ ctl_node_names)
+
+ for node_name, ps in ps_before.items():
+ if node_name == ctl_node_name:
+ # Check that rabbitmq_server has actually been stopped
+ # on the <ctl_node_name> node
+ assert not ps_after[node_name], (
+ "'rabbitmq_server' was not stopped on node {0}"
+ .format(node_name))
+ else:
+ # Check that rabbitmq_server on other ctl nodes
+ # was not restarted
+ assert ps == ps_after[node_name], (
+ "'rabbitmq_server' was restarted while it shouldn't!")
+
+ # Mysql case
+ # STEP #5
+ show_step(5)
+ # First, ensure that mysqld is running on all controllers
+ ps_before = self.get_ps_time(
+ underlay, "mysqld", ctl_node_names)
+ assert all(["mysqld" in p for n, p in ps_before.items()]), (
+ "'mysqld' is not running on some nodes: {0}"
+ .format(ps_before))
+
+ # Check haproxy status on the node with VIP and find the mysql backend
+ # which is receiving the connections
+ haproxy_status = common_services_actions.get_haproxy_status(minion_vip)
+ mysql_status = haproxy_status['mysql_cluster']
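+ # The 'mysql_cluster' proxy typically contains 'FRONTEND', 'BACKEND'
+ # and one backend entry per ctl node (for example 'ctl01')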
+ mysql_tgt = ''
+ scur = 0
+ for svname in mysql_status.keys():
+ if svname == "FRONTEND" or svname == "BACKEND":
+ continue
+ snew = int(mysql_status[svname]['scur'])
+ if scur < snew:
+ scur = snew
+ mysql_tgt = svname + '*'
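+ # 'mysql_tgt' is now a salt glob (for example 'ctl01*') matching the
+ # minion that hosts the most loaded mysql backend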
+ assert scur > 0, ("No sessions to the 'mysql_cluster' haproxy backend "
+ "on the node with VIP; something is wrong with the cluster.")
+
+ # STEP #6
+ show_step(6)
+ LOG.info("Scheduling to kill mysqld on the minion {0}"
+ .format(ctl_minion))
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'killall -9 -u mysql'".format(mysql_tgt),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
+
+ LOG.info("'at -l':\n" + underlay.check_call(
+ "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+ # STEP #7
+ show_step(7)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=4, timeout=900)
+
+ # STEP #8
+ show_step(8)
+ ret = salt.service_status("I@nova:controller:enabled:True",
+ "mysql")
+ LOG.info(ret)
+ ps_after = self.get_ps_time(underlay, "mysqld", ctl_node_names)
+
+ for node_name, ps in ps_before.items():
+ if node_name.startswith(mysql_tgt.rstrip('*')):
+ # Check that mysqld has actually been restarted on the node
+ # where it was killed (matching <mysql_tgt>)
+ assert ps_after[node_name] and (ps != ps_after[node_name]), (
+ "Mysql wasn't restarted on node {0}: {1}"
+ .format(node_name, ps_after[node_name]))
+ else:
+ # Check that mysql on other ctl nodes was not restarted
+ assert ps == ps_after[node_name], (
+ "Mysql was restarted while it shouldn't have been on node {0}"
+ .format(node_name))
+
+ # STEP #9
+ show_step(9)
+ # TODO(ddmitriev): check galera cluster status and replication
+ # like it was checked in OSTF.
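+ # A possible check (sketch, not implemented here): run
+ #   salt 'ctl*' cmd.run "mysql -e \"SHOW STATUS LIKE 'wsrep%'\""
+ # and verify that 'wsrep_cluster_size' equals len(ctl_node_names)
+ # and 'wsrep_local_state_comment' is 'Synced' on every node.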
+
+ # STEP #10
+ show_step(10)
+ results = rally.run_tempest(pattern='set=smoke',
+ report_prefix=func_name,
+ timeout=1800)
+ # STEP #11
+ show_step(11)
+ assert not results['fail'], self.show_failed_msg(results['fail'])
+
+ LOG.info("*************** DONE **************")