Add HA test "kill keepalived, haproxy"

- Also added time sync on '*' minions
  after reverting a snapshot in the salt_deployed fixture,
  and after creating a snapshot in the other fixtures

Change-Id: Ia5c5363bf55399422785f66e88e861c23cfab531
diff --git a/tcp_tests/fixtures/ceph_fixtures.py b/tcp_tests/fixtures/ceph_fixtures.py
index c294542..0b2ef50 100644
--- a/tcp_tests/fixtures/ceph_fixtures.py
+++ b/tcp_tests/fixtures/ceph_fixtures.py
@@ -40,7 +40,7 @@
 @pytest.fixture(scope='function')
 def ceph_deployed(revert_snapshot, request, config,
                   hardware, underlay, common_services_deployed,
-                  ceph_actions):
+                  salt_deployed, ceph_actions):
     """Fixture to get or install Ceph services on environment
 
     :param revert_snapshot: fixture that reverts snapshot that is specified
@@ -72,6 +72,7 @@
         commands = underlay.read_template(steps_path)
         ceph_actions.install(commands)
         hardware.create_snapshot(ext.SNAPSHOT.ceph_deployed)
+        salt_deployed.sync_time()
 
     else:
         # 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/common_services_fixtures.py b/tcp_tests/fixtures/common_services_fixtures.py
index 5d4c56a..7d1c73f 100644
--- a/tcp_tests/fixtures/common_services_fixtures.py
+++ b/tcp_tests/fixtures/common_services_fixtures.py
@@ -71,6 +71,7 @@
         commands = underlay.read_template(steps_path)
         common_services_actions.install(commands)
         hardware.create_snapshot(ext.SNAPSHOT.common_services_deployed)
+        salt_deployed.sync_time()
 
     else:
         # 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/decapod_fixtures.py b/tcp_tests/fixtures/decapod_fixtures.py
index 7f064c5..8e40b41 100644
--- a/tcp_tests/fixtures/decapod_fixtures.py
+++ b/tcp_tests/fixtures/decapod_fixtures.py
@@ -68,6 +68,7 @@
         commands = underlay.read_template(steps_path)
         decapod_actions.install(commands)
         hardware.create_snapshot(ext.SNAPSHOT.decapod_deployed)
+        salt_deployed.sync_time()
 
     else:
         # 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/k8s_fixtures.py b/tcp_tests/fixtures/k8s_fixtures.py
index 3cacbaf..356a51b 100644
--- a/tcp_tests/fixtures/k8s_fixtures.py
+++ b/tcp_tests/fixtures/k8s_fixtures.py
@@ -38,7 +38,7 @@
 @pytest.mark.revert_snapshot(ext.SNAPSHOT.k8s_deployed)
 @pytest.fixture(scope='function')
 def k8s_deployed(revert_snapshot, request, config, hardware, underlay,
-                 common_services_deployed, k8s_actions):
+                 common_services_deployed, salt_deployed, k8s_actions):
     """Fixture to get or install k8s on environment
 
     :param revert_snapshot: fixture that reverts snapshot that is specified
@@ -71,6 +71,7 @@
         commands = underlay.read_template(steps_path)
         k8s_actions.install(commands)
         hardware.create_snapshot(ext.SNAPSHOT.k8s_deployed)
+        salt_deployed.sync_time()
 
     # Workaround for keepalived hang issue after env revert from snapshot
     # see https://mirantis.jira.com/browse/PROD-12038
diff --git a/tcp_tests/fixtures/openstack_fixtures.py b/tcp_tests/fixtures/openstack_fixtures.py
index 1926299..8e92e77 100644
--- a/tcp_tests/fixtures/openstack_fixtures.py
+++ b/tcp_tests/fixtures/openstack_fixtures.py
@@ -41,7 +41,7 @@
 @pytest.fixture(scope='function')
 def openstack_deployed(revert_snapshot, request, config,
                        hardware, underlay, common_services_deployed,
-                       openstack_actions, rally):
+                       salt_deployed, openstack_actions, rally):
     """Fixture to get or install OpenStack services on environment
 
     :param revert_snapshot: fixture that reverts snapshot that is specified
@@ -99,6 +99,7 @@
             rally.run_container()
 
         hardware.create_snapshot(ext.SNAPSHOT.openstack_deployed)
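+        # Sync time on the minions; the node clocks may drift or get
+        # skewed while the environment is being deployed and snapshotted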
+        salt_deployed.sync_time()
 
     else:
         # 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/oss_fixtures.py b/tcp_tests/fixtures/oss_fixtures.py
index d46427b..a74313b 100644
--- a/tcp_tests/fixtures/oss_fixtures.py
+++ b/tcp_tests/fixtures/oss_fixtures.py
@@ -68,6 +68,7 @@
         commands = underlay.read_template(steps_path)
         oss_actions.install(commands)
         hardware.create_snapshot(ext.SNAPSHOT.oss_deployed)
+        salt_deployed.sync_time()
 
     else:
         # 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/salt_fixtures.py b/tcp_tests/fixtures/salt_fixtures.py
index d72b1fc..aff28dc 100644
--- a/tcp_tests/fixtures/salt_fixtures.py
+++ b/tcp_tests/fixtures/salt_fixtures.py
@@ -78,6 +78,7 @@
                         for n in config.underlay.ssh)]
 
         hardware.create_snapshot(ext.SNAPSHOT.salt_deployed)
+        salt_actions.sync_time()
 
     else:
         # 1. hardware environment created and powered on
@@ -87,4 +88,6 @@
         #    installed TCP API endpoint
         pass
 
+    salt_actions.sync_time()
+
     return salt_actions
diff --git a/tcp_tests/fixtures/stacklight_fixtures.py b/tcp_tests/fixtures/stacklight_fixtures.py
index 8028383..c1747b8 100644
--- a/tcp_tests/fixtures/stacklight_fixtures.py
+++ b/tcp_tests/fixtures/stacklight_fixtures.py
@@ -39,7 +39,7 @@
 @pytest.fixture(scope='function')
 def sl_deployed(revert_snapshot, request, config,
                 hardware, underlay, common_services_deployed,
-                sl_actions):
+                salt_deployed, sl_actions):
     """Fixture to get or install SL services on environment
 
     :param revert_snapshot: fixture that reverts snapshot that is specified
@@ -57,6 +57,7 @@
         commands = underlay.read_template(steps_path)
         sl_actions.install(commands)
         hardware.create_snapshot(ext.SNAPSHOT.sl_deployed)
+        salt_deployed.sync_time()
 
     else:
         # 1. hardware environment created and powered on
diff --git a/tcp_tests/managers/common_services_manager.py b/tcp_tests/managers/common_services_manager.py
index e29cdd6..1e783a8 100644
--- a/tcp_tests/managers/common_services_manager.py
+++ b/tcp_tests/managers/common_services_manager.py
@@ -41,6 +41,9 @@
         """Get minion ID where keepalived VIP is at the moment"""
         tgt = 'I@keepalived:cluster:enabled:True'
         grains = 'ip_interfaces'
+        # Refresh the grains first, so that 'ip_interfaces' reflects
+        # the current location of the keepalived VIP
+        self._salt.run_state(tgt, 'saltutil.refresh_grains')
+        # Get the refreshed grains
         result = self._salt.get_grains(tgt=tgt, grains=grains)[0]
         minion_ids = [
             minion_id for minion_id, interfaces in result.items()
diff --git a/tcp_tests/managers/rallymanager.py b/tcp_tests/managers/rallymanager.py
index 87f8805..8282bcc 100644
--- a/tcp_tests/managers/rallymanager.py
+++ b/tcp_tests/managers/rallymanager.py
@@ -72,8 +72,8 @@
         docker_cmd = ('docker exec -i {docker_id} bash -c "{cmd}"'
                       .format(cmd=cmd, docker_id=self.docker_id))
         LOG.info("Executing: {docker_cmd}".format(docker_cmd=docker_cmd))
-        self._underlay.check_call(docker_cmd, node_name=self._node_name,
-                                  verbose=verbose, timeout=timeout)
+        return self._underlay.check_call(docker_cmd, node_name=self._node_name,
+                                         verbose=verbose, timeout=timeout)
 
     def _run(self):
         """Start the rally container in the background"""
@@ -148,20 +148,26 @@
             task_path=task_path, task_content=task_content)
         self._underlay.check_call(cmd, node_name=self._node_name)
 
-    def run_task(self, task='', timeout=None, raise_on_timeout=True):
+    def run_task(self, task='', timeout=None, raise_on_timeout=True,
+                 verbose=False):
         """Run rally task
 
         :param task: path to json or yaml file with the task definition
         :param raise_on_timeout: bool, ignore TimeoutError if False
+        :param verbose: show rally output to console if True
         """
         try:
-            self._docker_exec("rally task start {task}".format(task=task),
-                              timeout=timeout, verbose=True)
+            res = self._docker_exec(
+                "rally task start {task}".format(task=task),
+                timeout=timeout,
+                verbose=verbose)
         except error.TimeoutError:
             if raise_on_timeout:
                 raise
             else:
+                res = None
                 pass
+        return res
 
     # Updated to replace the OpenStackManager method run_tempest
     def run_tempest(self, conf_name='/var/lib/lvm_mcp.conf',
diff --git a/tcp_tests/managers/saltmanager.py b/tcp_tests/managers/saltmanager.py
index 5249186..1ff5324 100644
--- a/tcp_tests/managers/saltmanager.py
+++ b/tcp_tests/managers/saltmanager.py
@@ -17,7 +17,8 @@
 from collections import defaultdict
 
 from datetime import datetime
-from pepper.libpepper import Pepper
+from pepper import libpepper
+from tcp_tests.helpers import utils
 from tcp_tests import settings
 from tcp_tests import logger
 from tcp_tests.managers.execute_commands import ExecuteCommandsMixin
@@ -94,7 +95,7 @@
         url = "http://{host}:{port}".format(
             host=self.host, port=self.port)
         LOG.info("Connecting to Salt API {0}".format(url))
-        self.__api = Pepper(url)
+        self.__api = libpepper.Pepper(url)
         self.__session_start = login()
         return self.__api
 
@@ -208,3 +209,18 @@
     def service_stop(self, tgt, service):
         result = self.local(tgt=tgt, fun='service.stop', args=service)
         return result['return']
+
+    @utils.retry(3, exception=libpepper.PepperException)
+    def sync_time(self, tgt='*'):
+        LOG.info("NTP time sync on the salt minions '{0}'".format(tgt))
+        # Force authentication update on the next API access
+        # because previous authentication most probably is not valid
+        # before or after time sync.
+        self.__api = None
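+        # 'ntpd -gq' performs a one-shot time sync and exits; '-g' allows
+        # correcting an arbitrarily large initial offset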
+        self.run_state(
+            tgt,
+            'cmd.run', 'service ntp stop; ntpd -gq; service ntp start')
+        new_time_res = self.run_state(tgt, 'cmd.run', 'date')
+        for node_name, time in sorted(new_time_res[0]['return'][0].items()):
+            LOG.info("{0}: {1}".format(node_name, time))
+        self.__api = None
diff --git a/tcp_tests/tests/system/test_failover_openstack_services.py b/tcp_tests/tests/system/test_failover_openstack_services.py
index 87159d6..16b4a8c 100644
--- a/tcp_tests/tests/system/test_failover_openstack_services.py
+++ b/tcp_tests/tests/system/test_failover_openstack_services.py
@@ -59,6 +59,38 @@
             '\n\n  '.join([(name + ': ' + detail)
                            for name, detail in failed.items()]))
 
+    def create_and_run_rally_load_task(
+            self, rally, times, concurrency, timeout, raise_on_timeout=False):
+
+        rally.create_rally_task('/root/rally/rally_load_task.json',
+                                rally_load_task(times, concurrency))
+        LOG.info("Running rally load task: {0} iterations with concurrency {1}"
+                 ", timeout: {2} sec".format(times, concurrency, timeout))
+
+        # Run rally task with created task file
+        res = rally.run_task('/home/rally/.rally/rally_load_task.json',
+                             timeout=timeout,
+                             raise_on_timeout=raise_on_timeout,
+                             verbose=False)
+        if res is None:
+            # The task was interrupted by timeout with raise_on_timeout=False
+            LOG.error("Rally task was not completed, no output to show")
+            return
+        # LOG only lines related to the task iterations,
+        # skip all other setup/teardown messages
+        for line in res['stdout']:
+            if 'rally.task.runner' in line:
+                LOG.info(line.strip())
+
+    def get_ps_time(self, underlay, process_name, node_names):
+        """Get the started datetime of the process on the specified nodes
+
+        Returns the dict {<node_name>: <str>, } where <str> is the 'ps' output
+        """
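+        # Note: "[^]]" in the grep pattern prevents grep from matching
+        # its own process in the 'ps' output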
+        res = {
+            node_name: underlay.check_call(
+                "ps -eo lstart,cmd|grep [^]]{0}".format(process_name),
+                node_name=node_name, raise_on_err=False)['stdout_str']
+            for node_name in node_names
+        }
+        return res
+
     @pytest.mark.grab_versions
     @pytest.mark.fail_snapshot
     @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
@@ -88,12 +120,9 @@
             target='ctl')
 
         # Get the ps output with datetime of the process
-        ps_before = {
-            node_name: underlay.check_call(
-                "ps -eo lstart,cmd|grep [^]]keepalived",
-                node_name=node_name)['stdout_str']
-            for node_name in ctl_node_names
-        }
+        ps_before = self.get_ps_time(underlay, "keepalived", ctl_node_names)
+        assert all(["keepalived" in p for n, p in ps_before.items()]), (
+            "'keepalived' is not running on some nodes: {0}".format(ps_before))
 
         # STEP #1
         show_step(1)
@@ -105,27 +134,19 @@
 
         # STEP #2
         show_step(2)
-        # Create a task file in the directory that will be mounted to rally
-        rally.create_rally_task('/root/rally/rally_load_task.json',
-                                rally_load_task(times=60, concurrency=6))
         # Run rally task with created task file
-        rally.run_task('/home/rally/.rally/rally_load_task.json', timeout=900,
-                       raise_on_timeout=False)
+        self.create_and_run_rally_load_task(
+            rally, times=60, concurrency=6, timeout=900)
 
         # STEP #3
         show_step(3)
         ret = salt.service_status("I@nova:controller:enabled:True",
                                   "keepalived")
         LOG.info(ret)
-        ps_after = {
-            node_name: underlay.check_call(
-                "ps -eo lstart,cmd|grep [^]]keepalived",
-                node_name=node_name)['stdout_str']
-            for node_name in ctl_node_names
-        }
-
+        ps_after = self.get_ps_time(underlay, "keepalived", ctl_node_names)
         for node_name, ps in ps_before.items():
-            assert ps != ps_after[node_name], "Keepalived wasn't restarted!"
+            assert ps_after[node_name] and (ps != ps_after[node_name]), (
+                "Keepalived wasn't restarted on node {0}".format(node_name))
 
         # STEP #4
         show_step(4)
@@ -168,12 +189,9 @@
             target='ctl')
 
         # Get the ps output with datetime of the process
-        ps_before = {
-            node_name: underlay.check_call(
-                "ps -eo lstart,cmd|grep [^]]keepalived",
-                node_name=node_name)['stdout_str']
-            for node_name in ctl_node_names
-        }
+        ps_before = self.get_ps_time(underlay, "keepalived", ctl_node_names)
+        assert all(["keepalived" in p for n, p in ps_before.items()]), (
+            "'keepalived' is not running on some nodes: {0}".format(ps_before))
 
         # STEP #1
         show_step(1)
@@ -194,24 +212,16 @@
 
         # STEP #3
         show_step(3)
-        # Create a task file in the directory that will be mounted to rally
-        rally.create_rally_task('/root/rally/rally_load_task.json',
-                                rally_load_task(times=60, concurrency=6))
         # Run rally task with created task file
-        rally.run_task('/home/rally/.rally/rally_load_task.json', timeout=900,
-                       raise_on_timeout=False)
+        self.create_and_run_rally_load_task(
+            rally, times=60, concurrency=6, timeout=900)
 
         # STEP #4
         show_step(4)
         ret = salt.service_status("I@nova:controller:enabled:True",
                                   "keepalived")
         LOG.info(ret)
-        ps_after = {
-            node_name: underlay.check_call(
-                "ps -eo lstart,cmd|grep [^]]keepalived",
-                node_name=node_name, raise_on_err=False)['stdout_str']
-            for node_name in ctl_node_names
-        }
+        ps_after = self.get_ps_time(underlay, "keepalived", ctl_node_names)
 
         for node_name, ps in ps_before.items():
             if node_name == minion_vip:
@@ -234,3 +244,178 @@
         assert not results['fail'], self.show_failed_msg(results['fail'])
 
         LOG.info("*************** DONE **************")
+
+    @pytest.mark.grab_versions
+    @pytest.mark.fail_snapshot
+    @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
+    def test_kill_keepalived(self, func_name, underlay, config,
+                             openstack_deployed, common_services_actions,
+                             salt_actions, openstack_actions,
+                             rally, show_step):
+        """Test kill keepalived and haproxy on ctl node with VIP under load
+
+        Scenario:
+            1. Find controller minion id with VIP
+            2. Set keepalived to be killed on the ctl node with VIP
+               in a few minutes, TR case #3385683
+            3. Run rally task to generate load (some tasks should fail
+               because of step 2)
+            4. Check that keepalived was killed on the ctl node with VIP
+            5. Check that SL sent an e-mail notification about the failed
+               keepalived service; then remove the VIP that remains
+               on the previous VIP node while the rally load task is running
+            6. Check that VIP was actually migrated to a new node
+            7. Find controller minion id with migrated VIP
+            8. Set haproxy to be killed on the ctl node with VIP
+               in a few minutes, TR case #4753980
+            9. Run rally task to generate load (some tasks should fail
+               because of step 8)
+            10. Check that haproxy was killed on the ctl node with VIP
+               and started again by systemd
+            11. Run tempest smoke after failover
+            12. Check tempest report for failed tests
+
+        Requirements:
+            - Salt cluster
+            - OpenStack cluster
+        """
+        common_services_actions.check_keepalived_pillar()
+        salt = salt_actions
+
+        ctl_node_names = underlay.get_target_node_names(
+            target='ctl')
+
+        # Keepalived case
+        # STEP #1
+        show_step(1)
+        # Get the ps output with datetime of the process
+        ps_before = self.get_ps_time(underlay, "keepalived", ctl_node_names)
+        assert all(["keepalived" in p for n, p in ps_before.items()]), (
+            "'keepalived' is not running on some nodes: {0}".format(ps_before))
+
+        ctl_vip_pillar = salt.get_pillar(
+            tgt="I@nova:controller:enabled:True",
+            pillar="_param:cluster_vip_address")[0]
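+        # The cluster VIP address should be the same in the pillar
+        # of every controller, so take the first value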
+        vip = [vip for minion_id, vip in ctl_vip_pillar.items()][0]
+        minion_vip = common_services_actions.get_keepalived_vip_minion_id(vip)
+        LOG.info("VIP {0} is on {1}".format(vip, minion_vip))
+
+        # STEP #2
+        show_step(2)
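+        # Schedule 'killall -9 keepalived' on the VIP node to fire
+        # in a few minutes, while the rally load task will be running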
+        underlay.delayed_call(
+            "salt '{0}' cmd.run 'killall -9 keepalived'".format(minion_vip),
+            host=config.salt.salt_master_host,
+            delay_min=2,
+            delay_max=3)
+
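+        # Show the 'at' queue to make sure the delayed kill is scheduled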
+        LOG.info("'at -l':\n" + underlay.check_call(
+            "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+        # STEP #3
+        show_step(3)
+        # Run rally task with created task file
+        self.create_and_run_rally_load_task(
+            rally, times=60, concurrency=4, timeout=900)
+
+        # STEP #4
+        show_step(4)
+        ret = salt.service_status("I@nova:controller:enabled:True",
+                                  "keepalived")
+        LOG.info(ret)
+        ps_after = self.get_ps_time(underlay, "keepalived", ctl_node_names)
+
+        for node_name, ps in ps_before.items():
+            if node_name == minion_vip:
+                # Check that keepalived actually stopped on <minion_vip> node
+                assert not ps_after[node_name], (
+                    "Keepalived was not stopped on node {0}"
+                    .format(minion_vip))
+            else:
+                # Check that keepalived on other ctl nodes was not restarted
+                assert ps == ps_after[node_name], (
+                   "Keepalived was restarted while it shouldn't have been!")
+        # STEP #5
+        show_step(5)
+        # TODO(ddmitriev):
+        #        5. Check that SL sent an e-mail notification about the failed
+        #        keepalived service, and then remove the VIP remaining
+        #        on the node after killing keepalived.
+
+        # Remove the VIP address manually because
+        # the killed keepalived cannot do it
+        underlay.delayed_call(
+            "salt '{0}' cmd.run 'ip a d {1}/32 dev ens4'"
+            .format(minion_vip, vip),
+            host=config.salt.salt_master_host,
+            delay_min=2,
+            delay_max=3)
+        # Run rally task with created task file
+        self.create_and_run_rally_load_task(
+            rally, times=60, concurrency=4, timeout=900)
+
+        # STEP #6
+        show_step(6)
+        # Check that VIP has been actually migrated to a new node
+        new_minion_vip = common_services_actions.get_keepalived_vip_minion_id(
+            vip)
+        LOG.info("Migrated VIP {0} is on {1}".format(vip, new_minion_vip))
+        assert new_minion_vip != minion_vip, (
+            "VIP {0} wasn't migrated from {1} after killing keepalived!"
+            .format(vip, minion_vip))
+        common_services_actions.check_keepalived_pillar()
+
+        # Haproxy case
+        # STEP #7
+        show_step(7)
+        # Get the ps output with datetime of the process
+        ps_before = self.get_ps_time(underlay, "haproxy", ctl_node_names)
+        assert all(["haproxy" in p for n, p in ps_before.items()]), (
+            "'haproxy' is not running on some nodes: {0}".format(ps_before))
+
+        # STEP #8
+        show_step(8)
+        underlay.delayed_call(
+            "salt '{0}' cmd.run 'killall -9 haproxy'".format(new_minion_vip),
+            host=config.salt.salt_master_host,
+            delay_min=2,
+            delay_max=3)
+
+        LOG.info("'at -l':\n" + underlay.check_call(
+            "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+        # STEP #9
+        show_step(9)
+        # Run rally task with created task file
+        self.create_and_run_rally_load_task(
+            rally, times=200, concurrency=4, timeout=1800)
+
+        # STEP #10
+        show_step(10)
+        ret = salt.service_status("I@nova:controller:enabled:True",
+                                  "haproxy")
+        LOG.info(ret)
+        ps_after = self.get_ps_time(underlay, "haproxy", ctl_node_names)
+
+        for node_name, ps in ps_before.items():
+            if node_name == new_minion_vip:
+                # Check that haproxy was restarted on <new_minion_vip> node
+                assert ps_after[node_name] and (ps != ps_after[node_name]), (
+                    "Haproxy wasn't restarted on node {0}: {1}"
+                    .format(node_name, ps_after[node_name]))
+            else:
+                # Check that haproxy on other ctl nodes was not restarted
+                assert ps == ps_after[node_name], (
+                   "Haproxy was restarted on node {0} while it "
+                   "shouldn't have been".format(node_name))
+
+        # STEP #11
+        show_step(11)
+        results = rally.run_tempest(pattern='set=smoke',
+                                    report_prefix=func_name,
+                                    timeout=1800)
+        # Step #12
+        show_step(12)
+        assert not results['fail'], self.show_failed_msg(results['fail'])
+
+        LOG.info("*************** DONE **************")