Add HA test "kill keepalived, haproxy"
- Also added time sync on '*' minions
  after reverting a snapshot in the salt_deployed fixture,
  and after creating a snapshot in the other fixtures
Change-Id: Ia5c5363bf55399422785f66e88e861c23cfab531
diff --git a/tcp_tests/fixtures/ceph_fixtures.py b/tcp_tests/fixtures/ceph_fixtures.py
index c294542..0b2ef50 100644
--- a/tcp_tests/fixtures/ceph_fixtures.py
+++ b/tcp_tests/fixtures/ceph_fixtures.py
@@ -40,7 +40,7 @@
@pytest.fixture(scope='function')
def ceph_deployed(revert_snapshot, request, config,
hardware, underlay, common_services_deployed,
- ceph_actions):
+ salt_deployed, ceph_actions):
"""Fixture to get or install Ceph services on environment
:param revert_snapshot: fixture that reverts snapshot that is specified
@@ -72,6 +72,7 @@
commands = underlay.read_template(steps_path)
ceph_actions.install(commands)
hardware.create_snapshot(ext.SNAPSHOT.ceph_deployed)
+ salt_deployed.sync_time()
else:
# 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/common_services_fixtures.py b/tcp_tests/fixtures/common_services_fixtures.py
index 5d4c56a..7d1c73f 100644
--- a/tcp_tests/fixtures/common_services_fixtures.py
+++ b/tcp_tests/fixtures/common_services_fixtures.py
@@ -71,6 +71,7 @@
commands = underlay.read_template(steps_path)
common_services_actions.install(commands)
hardware.create_snapshot(ext.SNAPSHOT.common_services_deployed)
+ salt_deployed.sync_time()
else:
# 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/decapod_fixtures.py b/tcp_tests/fixtures/decapod_fixtures.py
index 7f064c5..8e40b41 100644
--- a/tcp_tests/fixtures/decapod_fixtures.py
+++ b/tcp_tests/fixtures/decapod_fixtures.py
@@ -68,6 +68,7 @@
commands = underlay.read_template(steps_path)
decapod_actions.install(commands)
hardware.create_snapshot(ext.SNAPSHOT.decapod_deployed)
+ salt_deployed.sync_time()
else:
# 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/k8s_fixtures.py b/tcp_tests/fixtures/k8s_fixtures.py
index 3cacbaf..356a51b 100644
--- a/tcp_tests/fixtures/k8s_fixtures.py
+++ b/tcp_tests/fixtures/k8s_fixtures.py
@@ -38,7 +38,7 @@
@pytest.mark.revert_snapshot(ext.SNAPSHOT.k8s_deployed)
@pytest.fixture(scope='function')
def k8s_deployed(revert_snapshot, request, config, hardware, underlay,
- common_services_deployed, k8s_actions):
+ common_services_deployed, salt_deployed, k8s_actions):
"""Fixture to get or install k8s on environment
:param revert_snapshot: fixture that reverts snapshot that is specified
@@ -71,6 +71,7 @@
commands = underlay.read_template(steps_path)
k8s_actions.install(commands)
hardware.create_snapshot(ext.SNAPSHOT.k8s_deployed)
+ salt_deployed.sync_time()
# Workaround for keepalived hang issue after env revert from snapshot
# see https://mirantis.jira.com/browse/PROD-12038
diff --git a/tcp_tests/fixtures/openstack_fixtures.py b/tcp_tests/fixtures/openstack_fixtures.py
index 1926299..8e92e77 100644
--- a/tcp_tests/fixtures/openstack_fixtures.py
+++ b/tcp_tests/fixtures/openstack_fixtures.py
@@ -41,7 +41,7 @@
@pytest.fixture(scope='function')
def openstack_deployed(revert_snapshot, request, config,
hardware, underlay, common_services_deployed,
- openstack_actions, rally):
+ salt_deployed, openstack_actions, rally):
"""Fixture to get or install OpenStack services on environment
:param revert_snapshot: fixture that reverts snapshot that is specified
@@ -99,6 +99,7 @@
rally.run_container()
hardware.create_snapshot(ext.SNAPSHOT.openstack_deployed)
+ salt_deployed.sync_time()
else:
# 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/oss_fixtures.py b/tcp_tests/fixtures/oss_fixtures.py
index d46427b..a74313b 100644
--- a/tcp_tests/fixtures/oss_fixtures.py
+++ b/tcp_tests/fixtures/oss_fixtures.py
@@ -68,6 +68,7 @@
commands = underlay.read_template(steps_path)
oss_actions.install(commands)
hardware.create_snapshot(ext.SNAPSHOT.oss_deployed)
+ salt_deployed.sync_time()
else:
# 1. hardware environment created and powered on
diff --git a/tcp_tests/fixtures/salt_fixtures.py b/tcp_tests/fixtures/salt_fixtures.py
index d72b1fc..aff28dc 100644
--- a/tcp_tests/fixtures/salt_fixtures.py
+++ b/tcp_tests/fixtures/salt_fixtures.py
@@ -78,6 +78,7 @@
for n in config.underlay.ssh)]
hardware.create_snapshot(ext.SNAPSHOT.salt_deployed)
+ salt_actions.sync_time()
else:
# 1. hardware environment created and powered on
@@ -87,4 +88,6 @@
# installed TCP API endpoint
pass
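+    # Note (assumption): the minions' clocks may be stale after the
+    # snapshot revert, so force a time sync before returning the fixture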
+ salt_actions.sync_time()
+
return salt_actions
diff --git a/tcp_tests/fixtures/stacklight_fixtures.py b/tcp_tests/fixtures/stacklight_fixtures.py
index 8028383..c1747b8 100644
--- a/tcp_tests/fixtures/stacklight_fixtures.py
+++ b/tcp_tests/fixtures/stacklight_fixtures.py
@@ -39,7 +39,7 @@
@pytest.fixture(scope='function')
def sl_deployed(revert_snapshot, request, config,
hardware, underlay, common_services_deployed,
- sl_actions):
+ salt_deployed, sl_actions):
"""Fixture to get or install SL services on environment
:param revert_snapshot: fixture that reverts snapshot that is specified
@@ -57,6 +57,7 @@
commands = underlay.read_template(steps_path)
sl_actions.install(commands)
hardware.create_snapshot(ext.SNAPSHOT.sl_deployed)
+ salt_deployed.sync_time()
else:
# 1. hardware environment created and powered on
diff --git a/tcp_tests/managers/common_services_manager.py b/tcp_tests/managers/common_services_manager.py
index e29cdd6..1e783a8 100644
--- a/tcp_tests/managers/common_services_manager.py
+++ b/tcp_tests/managers/common_services_manager.py
@@ -41,6 +41,9 @@
"""Get minion ID where keepalived VIP is at the moment"""
tgt = 'I@keepalived:cluster:enabled:True'
grains = 'ip_interfaces'
+        # Refresh the grains first so that 'ip_interfaces' reflects
+        # the current location of the VIP
+        self._salt.run_state(tgt, 'saltutil.refresh_grains')
result = self._salt.get_grains(tgt=tgt, grains=grains)[0]
minion_ids = [
minion_id for minion_id, interfaces in result.items()
diff --git a/tcp_tests/managers/rallymanager.py b/tcp_tests/managers/rallymanager.py
index 87f8805..8282bcc 100644
--- a/tcp_tests/managers/rallymanager.py
+++ b/tcp_tests/managers/rallymanager.py
@@ -72,8 +72,8 @@
docker_cmd = ('docker exec -i {docker_id} bash -c "{cmd}"'
.format(cmd=cmd, docker_id=self.docker_id))
LOG.info("Executing: {docker_cmd}".format(docker_cmd=docker_cmd))
- self._underlay.check_call(docker_cmd, node_name=self._node_name,
- verbose=verbose, timeout=timeout)
+        return self._underlay.check_call(docker_cmd,
+                                         node_name=self._node_name,
+                                         verbose=verbose, timeout=timeout)
def _run(self):
"""Start the rally container in the background"""
@@ -148,20 +148,26 @@
task_path=task_path, task_content=task_content)
self._underlay.check_call(cmd, node_name=self._node_name)
- def run_task(self, task='', timeout=None, raise_on_timeout=True):
+ def run_task(self, task='', timeout=None, raise_on_timeout=True,
+ verbose=False):
"""Run rally task
        :param task: path to json or yaml file with the task definition
:param raise_on_timeout: bool, ignore TimeoutError if False
+ :param verbose: show rally output to console if True
"""
try:
- self._docker_exec("rally task start {task}".format(task=task),
- timeout=timeout, verbose=True)
+ res = self._docker_exec(
+ "rally task start {task}".format(task=task),
+ timeout=timeout,
+ verbose=verbose)
except error.TimeoutError:
if raise_on_timeout:
raise
else:
+                res = None
-                pass
+ return res
# Updated to replace the OpenStackManager method run_tempest
def run_tempest(self, conf_name='/var/lib/lvm_mcp.conf',
diff --git a/tcp_tests/managers/saltmanager.py b/tcp_tests/managers/saltmanager.py
index 5249186..1ff5324 100644
--- a/tcp_tests/managers/saltmanager.py
+++ b/tcp_tests/managers/saltmanager.py
@@ -17,7 +17,8 @@
from collections import defaultdict
from datetime import datetime
-from pepper.libpepper import Pepper
+from pepper import libpepper
+from tcp_tests.helpers import utils
from tcp_tests import settings
from tcp_tests import logger
from tcp_tests.managers.execute_commands import ExecuteCommandsMixin
@@ -94,7 +95,7 @@
url = "http://{host}:{port}".format(
host=self.host, port=self.port)
LOG.info("Connecting to Salt API {0}".format(url))
- self.__api = Pepper(url)
+ self.__api = libpepper.Pepper(url)
self.__session_start = login()
return self.__api
@@ -208,3 +209,18 @@
def service_stop(self, tgt, service):
result = self.local(tgt=tgt, fun='service.stop', args=service)
return result['return']
+
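+    # Note (assumption): re-authentication against the Salt API may fail
+    # right after the time change, so retry a few times on PepperException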
+ @utils.retry(3, exception=libpepper.PepperException)
+ def sync_time(self, tgt='*'):
+ LOG.info("NTP time sync on the salt minions '{0}'".format(tgt))
+ # Force authentication update on the next API access
+ # because previous authentication most probably is not valid
+ # before or after time sync.
+ self.__api = None
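+        # One-shot NTP sync: stop the service, step the clock with
+        # 'ntpd -gq' (-g allows a large initial offset, -q exits after
+        # setting the time), then start the service again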
+ self.run_state(
+ tgt,
+ 'cmd.run', 'service ntp stop; ntpd -gq; service ntp start')
+ new_time_res = self.run_state(tgt, 'cmd.run', 'date')
+ for node_name, time in sorted(new_time_res[0]['return'][0].items()):
+ LOG.info("{0}: {1}".format(node_name, time))
+ self.__api = None
diff --git a/tcp_tests/tests/system/test_failover_openstack_services.py b/tcp_tests/tests/system/test_failover_openstack_services.py
index 87159d6..16b4a8c 100644
--- a/tcp_tests/tests/system/test_failover_openstack_services.py
+++ b/tcp_tests/tests/system/test_failover_openstack_services.py
@@ -59,6 +59,38 @@
'\n\n '.join([(name + ': ' + detail)
for name, detail in failed.items()]))
+ def create_and_run_rally_load_task(
+ self, rally, times, concurrency, timeout, raise_on_timeout=False):
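+        """Create a rally load task file and run it
+
+        :param times: number of iterations in the generated rally task
+        :param concurrency: number of parallel rally scenario runners
+        :param timeout: time limit for the rally task, sec
+        :param raise_on_timeout: re-raise TimeoutError if True
+        """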
+
+ rally.create_rally_task('/root/rally/rally_load_task.json',
+ rally_load_task(times, concurrency))
+ LOG.info("Running rally load task: {0} iterations with concurrency {1}"
+ ", timeout: {2} sec".format(times, concurrency, timeout))
+
+ # Run rally task with created task file
+ res = rally.run_task('/home/rally/.rally/rally_load_task.json',
+ timeout=timeout,
+ raise_on_timeout=raise_on_timeout,
+ verbose=False)
+        # LOG only the lines related to the task iterations,
+        # skip all other setup/teardown messages
+        if res is not None:
+            for line in res['stdout']:
+                if 'rally.task.runner' in line:
+                    LOG.info(line.strip())
+
+ def get_ps_time(self, underlay, process_name, node_names):
+ """Get the started datetime of the process on the specified nodes
+
+ Returns the dict {<node_name>: <str>, } where <str> is the 'ps' output
+ """
+ res = {
+ node_name: underlay.check_call(
+ "ps -eo lstart,cmd|grep [^]]{0}".format(process_name),
+ node_name=node_name, raise_on_err=False)['stdout_str']
+ for node_name in node_names
+ }
+ return res
+
@pytest.mark.grab_versions
@pytest.mark.fail_snapshot
@pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
@@ -88,12 +120,9 @@
target='ctl')
# Get the ps output with datetime of the process
- ps_before = {
- node_name: underlay.check_call(
- "ps -eo lstart,cmd|grep [^]]keepalived",
- node_name=node_name)['stdout_str']
- for node_name in ctl_node_names
- }
+ ps_before = self.get_ps_time(underlay, "keepalived", ctl_node_names)
+ assert all(["keepalived" in p for n, p in ps_before.items()]), (
+ "'keepalived' is not running on some nodes: {0}".format(ps_before))
# STEP #1
show_step(1)
@@ -105,27 +134,19 @@
# STEP #2
show_step(2)
- # Create a task file in the directory that will be mounted to rally
- rally.create_rally_task('/root/rally/rally_load_task.json',
- rally_load_task(times=60, concurrency=6))
# Run rally task with created task file
- rally.run_task('/home/rally/.rally/rally_load_task.json', timeout=900,
- raise_on_timeout=False)
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=6, timeout=900)
# STEP #3
show_step(3)
ret = salt.service_status("I@nova:controller:enabled:True",
"keepalived")
LOG.info(ret)
- ps_after = {
- node_name: underlay.check_call(
- "ps -eo lstart,cmd|grep [^]]keepalived",
- node_name=node_name)['stdout_str']
- for node_name in ctl_node_names
- }
-
+ ps_after = self.get_ps_time(underlay, "keepalived", ctl_node_names)
for node_name, ps in ps_before.items():
- assert ps != ps_after[node_name], "Keepalived wasn't restarted!"
+ assert ps_after[node_name] and (ps != ps_after[node_name]), (
+ "Keepalived wasn't restarted on node {0}".format(node_name))
# STEP #4
show_step(4)
@@ -168,12 +189,9 @@
target='ctl')
# Get the ps output with datetime of the process
- ps_before = {
- node_name: underlay.check_call(
- "ps -eo lstart,cmd|grep [^]]keepalived",
- node_name=node_name)['stdout_str']
- for node_name in ctl_node_names
- }
+ ps_before = self.get_ps_time(underlay, "keepalived", ctl_node_names)
+ assert all(["keepalived" in p for n, p in ps_before.items()]), (
+ "'keepalived' is not running on some nodes: {0}".format(ps_before))
# STEP #1
show_step(1)
@@ -194,24 +212,16 @@
# STEP #3
show_step(3)
- # Create a task file in the directory that will be mounted to rally
- rally.create_rally_task('/root/rally/rally_load_task.json',
- rally_load_task(times=60, concurrency=6))
# Run rally task with created task file
- rally.run_task('/home/rally/.rally/rally_load_task.json', timeout=900,
- raise_on_timeout=False)
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=6, timeout=900)
# STEP #4
show_step(4)
ret = salt.service_status("I@nova:controller:enabled:True",
"keepalived")
LOG.info(ret)
- ps_after = {
- node_name: underlay.check_call(
- "ps -eo lstart,cmd|grep [^]]keepalived",
- node_name=node_name, raise_on_err=False)['stdout_str']
- for node_name in ctl_node_names
- }
+ ps_after = self.get_ps_time(underlay, "keepalived", ctl_node_names)
for node_name, ps in ps_before.items():
if node_name == minion_vip:
@@ -234,3 +244,178 @@
assert not results['fail'], self.show_failed_msg(results['fail'])
LOG.info("*************** DONE **************")
+
+ @pytest.mark.grab_versions
+ @pytest.mark.fail_snapshot
+ @pytest.mark.with_rally(rally_node="gtw01.", prepare_openstack=True)
+ def test_kill_keepalived(self, func_name, underlay, config,
+ openstack_deployed, common_services_actions,
+ salt_actions, openstack_actions,
+ rally, show_step):
+ """Test kill keepalived and haproxy on ctl node with VIP under load
+
+ Scenario:
+ 1. Find controller minion id with VIP
+ 2. Set keepalived to be killed on the ctl node with VIP
+               in a few minutes, TR case #3385683
+ 3. Run rally task to generate load (some tasks should fail
+ because of step 2)
+ 4. Check that keepalived was killed on the ctl node with VIP
+            5. Check that SL sent an e-mail notification about the failed
+               keepalived service, and then remove the VIP remaining
+               on the previous VIP node while the rally load task
+               is still running
+            6. Check that the VIP was actually migrated to a new node
+ 7. Find controller minion id with migrated VIP
+ 8. Set haproxy to be killed on the ctl node with VIP
+               in a few minutes, TR case #4753980
+            9. Run rally task to generate load (some tasks should fail
+               because of step 8)
+ 10. Check that haproxy was killed on the ctl node with VIP
+ and started again by systemd
+ 11. Run tempest smoke after failover
+ 12. Check tempest report for failed tests
+
+        Requirements:
+ - Salt cluster
+ - OpenStack cluster
+ """
+ common_services_actions.check_keepalived_pillar()
+ salt = salt_actions
+
+ ctl_node_names = underlay.get_target_node_names(
+ target='ctl')
+
+ # Keepalived case
+ # STEP #1
+ show_step(1)
+ # Get the ps output with datetime of the process
+ ps_before = self.get_ps_time(underlay, "keepalived", ctl_node_names)
+ assert all(["keepalived" in p for n, p in ps_before.items()]), (
+ "'keepalived' is not running on some nodes: {0}".format(ps_before))
+
+ ctl_vip_pillar = salt.get_pillar(
+ tgt="I@nova:controller:enabled:True",
+ pillar="_param:cluster_vip_address")[0]
+ vip = [vip for minion_id, vip in ctl_vip_pillar.items()][0]
+ minion_vip = common_services_actions.get_keepalived_vip_minion_id(vip)
+ LOG.info("VIP {0} is on {1}".format(vip, minion_vip))
+
+ # STEP #2
+ show_step(2)
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'killall -9 keepalived'".format(minion_vip),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
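+        # delayed_call schedules the command on the Salt master via 'at',
+        # so keepalived gets killed while the rally load below is running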
+
+ LOG.info("'at -l':\n" + underlay.check_call(
+ "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+ # STEP #3
+ show_step(3)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=4, timeout=900)
+
+ # STEP #4
+ show_step(4)
+ ret = salt.service_status("I@nova:controller:enabled:True",
+ "keepalived")
+ LOG.info(ret)
+ ps_after = self.get_ps_time(underlay, "keepalived", ctl_node_names)
+
+ for node_name, ps in ps_before.items():
+ if node_name == minion_vip:
+ # Check that keepalived actually stopped on <minion_vip> node
+ assert not ps_after[node_name], (
+ "Keepalived was not stopped on node {0}"
+ .format(minion_vip))
+ else:
+ # Check that keepalived on other ctl nodes was not restarted
+ assert ps == ps_after[node_name], (
+ "Keepalived was restarted while it shouldn't!")
+ # STEP #5
+ show_step(5)
+ # TODO(ddmitriev):
+ # 5. Check that SL sent a e-mail notification about the failed
+ # keepalived service, and then remove the VIP remaining
+ # on the node after killing keepalived.
+
+ # Remove the VIP address manually because
+ # the killed keepalived cannot do it
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'ip a d {1}/32 dev ens4'"
+ .format(minion_vip, vip),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=60, concurrency=4, timeout=900)
+
+ # STEP #6
+ show_step(6)
+ # Check that VIP has been actually migrated to a new node
+ new_minion_vip = common_services_actions.get_keepalived_vip_minion_id(
+ vip)
+ LOG.info("Migrated VIP {0} is on {1}".format(vip, new_minion_vip))
+ assert new_minion_vip != minion_vip, (
+ "VIP {0} wasn't migrated from {1} after killing keepalived!"
+            .format(vip, minion_vip))
+ common_services_actions.check_keepalived_pillar()
+
+ # Haproxy case
+ # STEP #7
+ show_step(7)
+ # Get the ps output with datetime of the process
+ ps_before = self.get_ps_time(underlay, "haproxy", ctl_node_names)
+ assert all(["haproxy" in p for n, p in ps_before.items()]), (
+ "'haproxy' is not running on some nodes: {0}".format(ps_before))
+
+ # STEP #8
+ show_step(8)
+ underlay.delayed_call(
+ "salt '{0}' cmd.run 'killall -9 haproxy'".format(new_minion_vip),
+ host=config.salt.salt_master_host,
+ delay_min=2,
+ delay_max=3)
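+        # Unlike keepalived, haproxy should be brought back automatically
+        # by systemd after the kill (checked in step 10)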
+
+ LOG.info("'at -l':\n" + underlay.check_call(
+ "at -l", host=config.salt.salt_master_host)['stdout_str'])
+
+ # STEP #9
+ show_step(9)
+ # Run rally task with created task file
+ self.create_and_run_rally_load_task(
+ rally, times=200, concurrency=4, timeout=1800)
+
+ # STEP #10
+ show_step(10)
+ ret = salt.service_status("I@nova:controller:enabled:True",
+ "haproxy")
+ LOG.info(ret)
+ ps_after = self.get_ps_time(underlay, "haproxy", ctl_node_names)
+
+ for node_name, ps in ps_before.items():
+ if node_name == new_minion_vip:
+                # Check that haproxy was actually restarted on <minion_vip>
+ assert ps_after[node_name] and (ps != ps_after[node_name]), (
+ "Haproxy wasn't restarted on node {0}: {1}"
+ .format(node_name, ps_after[node_name]))
+ else:
+                # Check that haproxy on other ctl nodes was not restarted
+ assert ps == ps_after[node_name], (
+ "Haproxy was restarted while it shouldn't on node {0}"
+ .format(node_name))
+
+ # STEP #11
+ show_step(11)
+ results = rally.run_tempest(pattern='set=smoke',
+ report_prefix=func_name,
+ timeout=1800)
+ # Step #12
+ show_step(12)
+ assert not results['fail'], self.show_failed_msg(results['fail'])
+
+ LOG.info("*************** DONE **************")