Merge "Add HA cases for rabbitmq and galera"
diff --git a/tcp_tests/fixtures/oss_fixtures.py b/tcp_tests/fixtures/oss_fixtures.py
index a74313b..95bbc54 100644
--- a/tcp_tests/fixtures/oss_fixtures.py
+++ b/tcp_tests/fixtures/oss_fixtures.py
@@ -79,3 +79,18 @@
         pass
 
     return oss_actions
+
+
+@pytest.mark.revert_snapshot(ext.SNAPSHOT.oss_deployed)
+@pytest.fixture(scope='function')
+def oss_sl_os_deployed(revert_snapshot,
+                       sl_os_deployed,
+                       oss_deployed):
+    """Fixture to get or install OSS, SL and OpenStack services
+
+    Uses the fixtures 'sl_os_deployed' and 'oss_deployed', with
+    'oss_deployed' as the top-level snapshot.
+
+    Returns the OSS manager instance provided by 'oss_deployed'
+    """
+    return oss_deployed
diff --git a/tcp_tests/managers/common_services_manager.py b/tcp_tests/managers/common_services_manager.py
index 1e783a8..c62114d 100644
--- a/tcp_tests/managers/common_services_manager.py
+++ b/tcp_tests/managers/common_services_manager.py
@@ -198,3 +198,31 @@
 
         LOG.debug("keepalived pillars check passed: {0}".format(vips))
         return vips
+
+    def get_haproxy_status(self, tgt):
+        """Get haproxy status for all backends on a specified minion
+
+        :param tgt: Salt target that matches exactly one minion,
+                    for example 'ctl01*'
+        :return: dict of dicts, {<pxname>: {<svname>: {'scur', 'smax',
+                 'status', 'rate'}}}
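+
+        A usage sketch (the 'ctl01*' target and the 'mysql_cluster' proxy
+        name are illustrative and depend on the deployed model):
+
+            status = common_services_actions.get_haproxy_status('ctl01*')
+            assert status['mysql_cluster']['BACKEND']['status'] == 'UP'
+        """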
+        cmd = ("echo 'show stat' | "
+               "socat 'UNIX-CONNECT:/run/haproxy/admin.sock' STDIO")
+        # Query the haproxy stats through its admin socket on the minion
+        res = self._salt.run_state(tgt, 'cmd.run', cmd)
+        output = res[0]['return'][0]
+        assert len(output.keys()) == 1, "Please specify a single minion in tgt"
+        minion_id = list(output.keys())[0]
+
+        haproxy_status = {}
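+        # 'show stat' returns CSV rows: the header row starts with '#' and
+        # is skipped; every other row describes one haproxy frontend,
+        # backend or backend server.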
+        for line in output[minion_id].splitlines():
+            if line.startswith("#"):
+                continue
+            status = line.split(",")
+            pxname = status[0]
+            svname = status[1]
+            if pxname not in haproxy_status:
+                haproxy_status[pxname] = {}
+            haproxy_status[pxname][svname] = {
+                'scur': status[4],     # sessions current
+                'smax': status[5],     # sessions max
+                'status': status[17],  # status: UP or DOWN
+                'rate': status[33],    # sessions rate
+            }
+        LOG.debug("Haproxy status: \n{0}".format(haproxy_status))
+        return haproxy_status
diff --git a/tcp_tests/managers/underlay_ssh_manager.py b/tcp_tests/managers/underlay_ssh_manager.py
index 2008fba..7d3da96 100644
--- a/tcp_tests/managers/underlay_ssh_manager.py
+++ b/tcp_tests/managers/underlay_ssh_manager.py
@@ -363,12 +363,13 @@
         with self.remote(node_name=host) as remote:
             remote.upload(source, destination)
 
-    def get_random_node(self):
+    def get_random_node(self, node_names=None):
         """Get random node name
 
+        :param node_names: optional list of node names to choose from,
+                           defaults to all node names from the underlay
         :return: str, name of node
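+
+        Example (the node names are illustrative):
+            underlay.get_random_node(['ctl01', 'ctl02', 'ctl03'])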
         """
-        return random.choice(self.node_names())
+        return random.choice(node_names or self.node_names())
 
     def yaml_editor(self, file_path, node_name=None, host=None,
                     address_pool=None):
diff --git a/tcp_tests/tests/system/conftest.py b/tcp_tests/tests/system/conftest.py
index ec3846d..64288ab 100644
--- a/tcp_tests/tests/system/conftest.py
+++ b/tcp_tests/tests/system/conftest.py
@@ -52,6 +52,7 @@
     # oss_fixtures
     'oss_actions',
     'oss_deployed',
+    'oss_sl_os_deployed',
     # decapod_fixtures
     'decapod_actions',
     'decapod_deployed',
diff --git a/tcp_tests/tests/system/test_failover_openstack_services.py b/tcp_tests/tests/system/test_failover_openstack_services.py
index 16b4a8c..37cff72 100644
--- a/tcp_tests/tests/system/test_failover_openstack_services.py
+++ b/tcp_tests/tests/system/test_failover_openstack_services.py
@@ -95,7 +95,8 @@
     @pytest.mark.fail_snapshot
     @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
     def test_restart_keepalived(self, func_name, underlay, config,
-                                openstack_deployed, common_services_actions,
+                                openstack_deployed, sl_os_deployed,
+                                common_services_actions,
                                 salt_actions, openstack_actions,
                                 rally, show_step):
         """Test restart keepalived on ctl* nodes
@@ -163,7 +164,8 @@
     @pytest.mark.fail_snapshot
     @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
     def test_stop_keepalived(self, func_name, underlay, config,
-                             openstack_deployed, common_services_actions,
+                             openstack_deployed, sl_os_deployed,
+                             common_services_actions,
                              salt_actions, openstack_actions,
                              rally, show_step):
         """Test stop keepalived on ctl node with VIP under load
@@ -249,7 +251,8 @@
     @pytest.mark.fail_snapshot
     @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
     def test_kill_keepalived(self, func_name, underlay, config,
-                             openstack_deployed, common_services_actions,
+                             openstack_deployed, sl_os_deployed,
+                             common_services_actions,
                              salt_actions, openstack_actions,
                              rally, show_step):
         """Test kill keepalived and haproxy on ctl node with VIP under load
@@ -341,6 +344,8 @@
         #        5. Check that SL sent a e-mail notification about the failed
         #        keepalived service, and then remove the VIP remaining
         #        on the node after killing keepalived.
+        #        Alternative: check the Prometheus alerts list on mon*:
+        #        curl http://localhost:15011/api/v1/alerts
 
         # Remove the VIP address manually because
         # the killed keepalived cannot do it
@@ -399,12 +404,13 @@
 
         for node_name, ps in ps_before.items():
             if node_name == new_minion_vip:
-                # Check that keepalived actually stopped on <minion_vip> node
+                # Check that haproxy has been actually restarted
+                # on <new_minion_vip> node
                 assert ps_after[node_name] and (ps != ps_after[node_name]), (
                     "Haproxy wasn't restarted on node {0}: {1}"
                     .format(node_name, ps_after[node_name]))
             else:
-                # Check that keepalived on other ctl nodes was not restarted
+                # Check that haproxy on other ctl nodes was not restarted
                 assert ps == ps_after[node_name], (
                    "Haproxy was restarted while it shouldn't on node {0}"
                    .format(node_name))
@@ -419,3 +425,178 @@
         assert not results['fail'], self.show_failed_msg(results['fail'])
 
         LOG.info("*************** DONE **************")
+
+    @pytest.mark.grab_versions
+    @pytest.mark.fail_snapshot
+    @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
+    def test_kill_rabbit_galera(self, func_name, underlay, config,
+                                openstack_deployed, sl_os_deployed,
+                                common_services_actions,
+                                salt_actions, openstack_actions,
+                                rally, show_step):
+        """Test kill rabbitmq and galera on ctl node with VIP under load
+
+        Scenario:
+            1. Find controller minion id with VIP
+            2. Set rabbitmq_server to be killed on a random ctl node
+               in a few minutes, TR case #3385677
+            3. Run rally task to generate load
+            4. Check that rabbitmq_server was killed on the scheduled
+               ctl node
+            5. Find the Galera backend (controller minion) which is
+               receiving the connections
+            6. Set mysql server to be killed on that node in a few minutes,
+               TR case #4753976
+            7. Run rally task to generate load
+            8. Check that mysql was killed and started again by systemd
+            9. Check galera cluster status and replication
+            10. Run tempest smoke after failover
+            11. Check tempest report for failed tests
+
+        Requirements:
+            - Salt cluster
+            - OpenStack cluster
+        """
+        common_services_actions.check_keepalived_pillar()
+        salt = salt_actions
+
+        ctl_node_names = underlay.get_target_node_names(
+            target='ctl')
+
+        # Rabbitmq case
+        # STEP #1
+        show_step(1)
+        # Get the ps output with datetime of the process
+        ps_before = self.get_ps_time(
+            underlay, "rabbitmq_server", ctl_node_names)
+        assert all(["rabbitmq_server" in p for n, p in ps_before.items()]), (
+            "'rabbitmq_server' is not running on some nodes: {0}"
+            .format(ps_before))
+
+        ctl_vip_pillar = salt.get_pillar(
+            tgt="I@nova:controller:enabled:True",
+            pillar="_param:cluster_vip_address")[0]
+        vip = list(ctl_vip_pillar.values())[0]
+        ctl_minions = list(ctl_vip_pillar.keys())
+        minion_vip = common_services_actions.get_keepalived_vip_minion_id(vip)
+        LOG.info("VIP {0} is on {1}".format(vip, minion_vip))
+
+        # STEP #2
+        show_step(2)
+
+        ctl_minion = underlay.get_random_node(ctl_minions)
+        ctl_node_name = salt_actions.get_grains(
+            tgt=ctl_minion, grains='fqdn')[0][ctl_minion]
+        LOG.info("Scheduling to kill rabbitmq on the minion {0}"
+                 .format(ctl_minion))
+        underlay.delayed_call(
+            "salt '{0}' cmd.run 'killall -9 -u rabbitmq'".format(ctl_minion),
+            host=config.salt.salt_master_host,
+            delay_min=2,
+            delay_max=3)
+
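+        # 'at -l' lists the jobs queued on the Salt master, confirming that
+        # the delayed kill has actually been scheduled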
+        LOG.info("'at -l':\n" + underlay.check_call(
+            "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+        # STEP #3
+        show_step(3)
+        # Run rally task with created task file
+        self.create_and_run_rally_load_task(
+            rally, times=60, concurrency=4, timeout=900)
+
+        # STEP #4
+        show_step(4)
+        ps_after = self.get_ps_time(underlay,
+                                    "rabbitmq_server",
+                                    ctl_node_names)
+
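+        # get_ps_time() output includes the process start time, so an empty
+        # value means the process is gone and a changed value means that it
+        # was restarted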
+        for node_name, ps in ps_before.items():
+            if node_name == ctl_node_name:
+                # Check that rabbitmq_server was actually stopped
+                # on the <ctl_node_name> node
+                assert not ps_after[node_name], (
+                    "'rabbitmq_server' was not stopped on node {0}"
+                    .format(ctl_node_name))
+            else:
+                # Check that rabbitmq_server on other ctl nodes
+                # was not restarted
+                assert ps == ps_after[node_name], (
+                    "'rabbitmq_server' was restarted on node {0} while it "
+                    "shouldn't have been".format(node_name))
+
+        # Mysql case
+        # STEP #5
+        show_step(5)
+        # At first, ensure that mysql is running on all controllers
+        ps_before = self.get_ps_time(
+            underlay, "mysqld", ctl_node_names)
+        assert all(["mysqld" in p for n, p in ps_before.items()]), (
+            "'mysqld' is not running on some nodes: {0}"
+            .format(ps_before))
+
+        # Check haproxy status on the node with VIP and find the mysql backend
+        # which is receiving the connections
+        haproxy_status = common_services_actions.get_haproxy_status(minion_vip)
+        mysql_status = haproxy_status['mysql_cluster']
+        mysql_tgt = ''
+        scur = 0
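+        # Skip the aggregate FRONTEND/BACKEND rows; the per-server row with
+        # the highest 'scur' (current sessions) is the Galera backend that
+        # actually receives the client connections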
+        for svname in mysql_status.keys():
+            if svname == "FRONTEND" or svname == "BACKEND":
+                continue
+            snew = int(mysql_status[svname]['scur'])
+            if scur < snew:
+                scur = snew
+                mysql_tgt = svname + '*'
+        assert scur > 0, (
+            "No sessions to the 'mysql_cluster' haproxy backend on the node "
+            "with VIP, something is wrong with the cluster.")
+
+        # STEP #6
+        show_step(6)
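+        # 'mysql_tgt' is the haproxy server name with a trailing '*',
+        # used as a Salt glob to match the corresponding minion id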
+        LOG.info("Scheduling to kill mysqld on the minion {0}"
+                 .format(mysql_tgt))
+        underlay.delayed_call(
+            "salt '{0}' cmd.run 'killall -9 -u mysql'".format(mysql_tgt),
+            host=config.salt.salt_master_host,
+            delay_min=2,
+            delay_max=3)
+
+        LOG.info("'at -l':\n" + underlay.check_call(
+            "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+        # STEP #7
+        show_step(7)
+        # Run rally task with created task file
+        self.create_and_run_rally_load_task(
+            rally, times=60, concurrency=4, timeout=900)
+
+        # STEP #8
+        show_step(8)
+        ret = salt.service_status("I@nova:controller:enabled:True",
+                                  "mysql")
+        LOG.info(ret)
+        ps_after = self.get_ps_time(underlay, "mysqld", ctl_node_names)
+
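+        # mysqld is expected to be restarted by systemd on the killed node,
+        # so the process should be running again with a new start time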
+        for node_name, ps in ps_before.items():
+            if node_name.startswith(mysql_tgt.rstrip('*')):
+                # Check that mysqld was actually restarted on the node
+                # where it had been killed
+                assert ps_after[node_name] and (ps != ps_after[node_name]), (
+                    "mysqld wasn't restarted on node {0}: {1}"
+                    .format(node_name, ps_after[node_name]))
+            else:
+                # Check that mysqld on the other ctl nodes was not restarted
+                assert ps == ps_after[node_name], (
+                    "mysqld was restarted on node {0} while it shouldn't "
+                    "have been".format(node_name))
+
+        # STEP #9
+        show_step(9)
+        # TODO(ddmitriev): check galera cluster status and replication
+        # like it was checked in OSTF.
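+        # A possible sketch (the pillar target and command are illustrative
+        # and not verified against the deployed model): query the wsrep
+        # status on every controller and compare the reported cluster sizes:
+        #
+        #   res = salt.run_state(
+        #       "I@galera:master or I@galera:slave", 'cmd.run',
+        #       "mysql -e \"SHOW STATUS LIKE 'wsrep_cluster_size'\"")
+        #
+        # and assert that all nodes report the same non-zero size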
+
+        # STEP #10
+        show_step(10)
+        results = rally.run_tempest(pattern='set=smoke',
+                                    report_prefix=func_name,
+                                    timeout=1800)
+        # STEP #11
+        show_step(11)
+        assert not results['fail'], self.show_failed_msg(results['fail'])
+
+        LOG.info("*************** DONE **************")