Add HA cases for rabbitmq and galera
- kill rabbitmq_server on random node
- kill mysql server on node with maximum mysql connections
Change-Id: I8cf1aab4a4213c339ddefcecaa86d9a80d52acdb
diff --git a/tcp_tests/tests/system/conftest.py b/tcp_tests/tests/system/conftest.py
index ec3846d..64288ab 100644
--- a/tcp_tests/tests/system/conftest.py
+++ b/tcp_tests/tests/system/conftest.py
@@ -52,6 +52,7 @@
# oss_fixtures
'oss_actions',
'oss_deployed',
+ 'oss_sl_os_deployed',
# decapod_fixtures
'decapod_actions',
'decapod_deployed',
diff --git a/tcp_tests/tests/system/test_failover_openstack_services.py b/tcp_tests/tests/system/test_failover_openstack_services.py
index 16b4a8c..37cff72 100644
--- a/tcp_tests/tests/system/test_failover_openstack_services.py
+++ b/tcp_tests/tests/system/test_failover_openstack_services.py
@@ -95,7 +95,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_restart_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test restart keepalived on ctl* nodes
@@ -163,7 +164,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_stop_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test stop keepalived on ctl node with VIP under load
@@ -249,7 +251,8 @@
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
def test_kill_keepalived(self, func_name, underlay, config,
- openstack_deployed, common_services_actions,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
salt_actions, openstack_actions,
rally, show_step):
"""Test kill keepalived and haproxy on ctl node with VIP under load
@@ -341,6 +344,8 @@
# 5. Check that SL sent a e-mail notification about the failed
# keepalived service, and then remove the VIP remaining
# on the node after killing keepalived.
+ # Alternative: check prometheus alerts list on mon*:
+ # curl http://localhost:15011/api/v1/alerts
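+        # A minimal illustrative sketch of that alternative check, assuming
+        # the alertmanager API answers on port 15011 on the mon* nodes as
+        # the comment above says; it only logs the alerts:
+        mon_alerts = underlay.check_call(
+            "salt 'mon*' cmd.run "
+            "'curl -s http://localhost:15011/api/v1/alerts'",
+            host=config.salt.salt_master_host)['stdout_str']
+        LOG.info("Prometheus alerts on mon nodes:\n{0}".format(mon_alerts))
+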
# Remove the VIP address manually because
# the killed keepalived cannot do it
@@ -399,12 +404,13 @@
for node_name, ps in ps_before.items():
if node_name == new_minion_vip:
- # Check that keepalived actually stopped on <minion_vip> node
+ # Check that haproxy has been actually restarted
+ # on <new_minion_vip> node
assert ps_after[node_name] and (ps != ps_after[node_name]), (
"Haproxy wasn't restarted on node {0}: {1}"
.format(node_name, ps_after[node_name]))
else:
- # Check that keepalived on other ctl nodes was not restarted
+ # Check that haproxy on other ctl nodes was not restarted
assert ps == ps_after[node_name], (
"Haproxy was restarted while it shouldn't on node {0}"
.format(node_name))
@@ -419,3 +425,178 @@
assert not results['fail'], self.show_failed_msg(results['fail'])
LOG.info("*************** DONE **************")
+
+ @pytest.mark.grab_versions
+ @pytest.mark.fail_snapshot
+ @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
+ def test_kill_rabbit_galera(self, func_name, underlay, config,
+ openstack_deployed, sl_os_deployed,
+ common_services_actions,
+ salt_actions, openstack_actions,
+ rally, show_step):
+        """Test kill rabbitmq and galera services on ctl nodes under load
+
+ Scenario:
+ 1. Find controller minion id with VIP
+            2. Set rabbitmq_server to be killed on a random ctl node
+               in a few minutes, TR case #3385677
+            3. Run rally task to generate load
+            4. Check that rabbitmq_server was killed on that ctl node
+            5. Find the controller minion id with the Galera backend
+               which is receiving the connections
+            6. Set mysql server to be killed on it in a few minutes,
+               TR case #4753976
+ 7. Run rally task to generate load
+ 8. Check that mysql was killed and started again by systemd
+ 9. Check galera cluster status and replication
+ 10. Run tempest smoke after failover
+ 11. Check tempest report for failed tests
+
+        Requirements:
+ - Salt cluster
+ - OpenStack cluster
+ """
+ common_services_actions.check_keepalived_pillar()
+ salt = salt_actions
+
+ ctl_node_names = underlay.get_target_node_names(
+ target='ctl')
+
+ # Rabbitmq case
+ # STEP #1
+ show_step(1)
+ # Get the ps output with datetime of the process
+ ps_before = self.get_ps_time(
+ underlay, "rabbitmq_server", ctl_node_names)
+ assert all(["rabbitmq_server" in p for n, p in ps_before.items()]), (
+ "'rabbitmq_server' is not running on some nodes: {0}"
+ .format(ps_before))
+
+ ctl_vip_pillar = salt.get_pillar(
+ tgt="I@nova:controller:enabled:True",
+ pillar="_param:cluster_vip_address")[0]
+ vip = [vip for minion_id, vip in ctl_vip_pillar.items()][0]
+ ctl_minions = ctl_vip_pillar.keys()
+ minion_vip = common_services_actions.get_keepalived_vip_minion_id(vip)
+ LOG.info("VIP {0} is on {1}".format(vip, minion_vip))
+
+ # STEP #2
+ show_step(2)
+
+ ctl_minion = underlay.get_random_node(ctl_minions)
+ ctl_node_name = salt_actions.get_grains(
+ tgt=ctl_minion, grains='fqdn')[0][ctl_minion]
+ LOG.info("Scheduling to kill rabbitmq on the minion {0}"
+ .format(ctl_minion))
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'killall -9 -u rabbitmq'".format(ctl_minion),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
+
+ LOG.info("'at -l':\n" + underlay.check_call(
+ "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+ # STEP #3
+ show_step(3)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=4, timeout=900)
+
+ # STEP #4
+ show_step(4)
+ ps_after = self.get_ps_time(underlay,
+ "rabbitmq_server",
+ ctl_node_names)
+
+ for node_name, ps in ps_before.items():
+ if node_name == ctl_node_name:
+                # Check that rabbitmq_server has been actually stopped
+                # on the <ctl_node_name> node
+                assert not ps_after[node_name], (
+                    "'rabbitmq_server' was not stopped on node {0}"
+                    .format(node_name))
+ else:
+ # Check that rabbitmq_server on other ctl nodes
+ # was not restarted
+ assert ps == ps_after[node_name], (
+ "'rabbitmq_server' was restarted while it shouldn't!")
+
+ # Mysql case
+ # STEP #5
+ show_step(5)
+ # At first, ensure that mysql is running on all controllers
+ ps_before = self.get_ps_time(
+ underlay, "mysqld", ctl_node_names)
+ assert all(["mysqld" in p for n, p in ps_before.items()]), (
+ "'mysqld' is not running on some nodes: {0}"
+ .format(ps_before))
+
+ # Check haproxy status on the node with VIP and find the mysql backend
+ # which is receiving the connections
+        haproxy_status = common_services_actions.get_haproxy_status(
+            minion_vip)
+ mysql_status = haproxy_status['mysql_cluster']
+ mysql_tgt = ''
+ scur = 0
+ for svname in mysql_status.keys():
+ if svname == "FRONTEND" or svname == "BACKEND":
+ continue
+ snew = int(mysql_status[svname]['scur'])
+ if scur < snew:
+ scur = snew
+ mysql_tgt = svname + '*'
+        assert scur > 0, ("No sessions to 'mysql_cluster' haproxy backend on "
+                          "the node with VIP, something is wrong with the "
+                          "cluster.")
+
+ # STEP #6
+ show_step(6)
+        LOG.info("Scheduling to kill mysqld on the minion {0}"
+                 .format(mysql_tgt))
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'killall -9 -u mysql'".format(mysql_tgt),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
+
+ LOG.info("'at -l':\n" + underlay.check_call(
+ "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+ # STEP #7
+ show_step(7)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=4, timeout=900)
+
+ # STEP #8
+ show_step(8)
+ ret = salt.service_status("I@nova:controller:enabled:True",
+ "mysql")
+ LOG.info(ret)
+ ps_after = self.get_ps_time(underlay, "mysqld", ctl_node_names)
+
+        # mysqld was killed on the node that matches the 'mysql_tgt'
+        # backend name (the trailing '*' is a salt glob, not part of it)
+        killed_node = mysql_tgt.rstrip('*')
+        for node_name, ps in ps_before.items():
+            if node_name.startswith(killed_node):
+                # Check that mysql has been actually restarted
+                # on the node where it was killed
+                assert ps_after[node_name] and (ps != ps_after[node_name]), (
+                    "Mysql wasn't restarted on node {0}: {1}"
+                    .format(node_name, ps_after[node_name]))
+            else:
+                # Check that mysql on other ctl nodes was not restarted
+                assert ps == ps_after[node_name], (
+                    "Mysql was restarted while it shouldn't on node {0}"
+                    .format(node_name))
+
+ # STEP #9
+ show_step(9)
+ # TODO(ddmitriev): check galera cluster status and replication
+ # like it was checked in OSTF.
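+        # A minimal illustrative sketch of such a check, assuming the local
+        # 'mysql' CLI on the ctl nodes can read the wsrep status without
+        # extra credentials; it only logs the reported cluster size and is
+        # not a substitute for the OSTF-like checks mentioned above:
+        wsrep_status = underlay.check_call(
+            "salt -C 'I@nova:controller:enabled:True' cmd.run "
+            "\"mysql -e \\\"SHOW STATUS LIKE 'wsrep_cluster_size'\\\"\"",
+            host=config.salt.salt_master_host)['stdout_str']
+        LOG.info("Galera wsrep_cluster_size:\n{0}".format(wsrep_status))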
+
+ # STEP #10
+ show_step(10)
+ results = rally.run_tempest(pattern='set=smoke',
+ report_prefix=func_name,
+ timeout=1800)
+        # STEP #11
+ show_step(11)
+ assert not results['fail'], self.show_failed_msg(results['fail'])
+
+ LOG.info("*************** DONE **************")