Add retry to the Prometheus targets check

* move common methods to sl_manager.py
* add retry to the Prometheus targets check
* fix address replacement in shared-salt.yaml
Change-Id: I33c6536c515ed0e967d11bf9cbf81899f399615e
Reviewed-on: https://review.gerrithub.io/378331
Reviewed-by: Dennis Dmitriev <dis.xcom@gmail.com>
Tested-by: Dennis Dmitriev <dis.xcom@gmail.com>
diff --git a/tcp_tests/managers/sl_manager.py b/tcp_tests/managers/sl_manager.py
index af75ff4..9ed13db 100644
--- a/tcp_tests/managers/sl_manager.py
+++ b/tcp_tests/managers/sl_manager.py
@@ -14,6 +14,8 @@
import os
+from devops.helpers import decorators
+
from tcp_tests.managers.execute_commands import ExecuteCommandsMixin
from tcp_tests.managers.clients.prometheus import prometheus_client
from tcp_tests import logger
@@ -97,3 +99,48 @@
r.download(
destination=file_path,
target=os.getcwd())
+
+ def check_docker_services(self, nodes, expected_services):
+ """Check presense of the specified docker services on all the nodes
+ :param nodes: list of strings, names of nodes to check
+ :param expected_services: list of strings, names of services to find
+ """
+ for node in nodes:
+ services_status = self.get_service_info_from_node(node)
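+            # get_service_info_from_node() is expected to return a mapping of
+            # service name to a replicas string (e.g. '1/1'); a '0' in the
+            # value means the service has no running replicas.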
+ assert len(services_status) == len(expected_services), \
+                'Some services are missing on node {0}. ' \
+ 'Current service list: {1}\nExpected service list: {2}' \
+ .format(node, services_status, expected_services)
+ for service in expected_services:
+ assert service in services_status,\
+ 'Missing service {0} in {1}'.format(service, services_status)
+ assert '0' not in services_status.get(service),\
+ 'Service {0} failed to start'.format(service)
+
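+    # Targets may take a while to register after deployment, so the check
+    # below is retried on AssertionError (10 attempts, 5 seconds apart).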
+ @decorators.retry(AssertionError, count=10, delay=5)
+ def check_prometheus_targets(self, nodes):
+ """Check the status for Prometheus targets
+ :param nodes: list of strings, names of nodes with keepalived VIP
+ """
+ prometheus_client = self.api
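+        # If the Prometheus API is unreachable, the keepalived VIP on the mon
+        # nodes is presumably not up; restart keepalived there and query the
+        # targets once more.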
+ try:
+ current_targets = prometheus_client.get_targets()
+        except Exception:
+ LOG.info('Restarting keepalived service on mon nodes...')
+ for node in nodes:
+ self._salt.local(tgt=node, fun='cmd.run',
+ args='systemctl restart keepalived')
+ LOG.warning(
+                'IP state on mon nodes after the keepalived restart: {0}'.format(
+ self._salt.local(tgt='mon*',
+ fun='cmd.run', args='ip a')))
+ current_targets = prometheus_client.get_targets()
+
+ LOG.debug('Current targets after install {0}'
+ .format(current_targets))
+ # Assert that targets are up
+ for entry in current_targets:
+ assert 'up' in entry['health'], \
+                'Target is down: {}'.format(entry)
diff --git a/tcp_tests/templates/shared-salt.yaml b/tcp_tests/templates/shared-salt.yaml
index bc9a14e..8ce1570 100644
--- a/tcp_tests/templates/shared-salt.yaml
+++ b/tcp_tests/templates/shared-salt.yaml
@@ -117,21 +117,22 @@
# 192.168.10 -> 10.16.0 (generated network for admin)
# 10.16.0 -> <external network>
# So let's replace the constant networks with keywords, and then the keywords with the desired networks.
- find /srv/salt/reclass/ -type f -exec sed -i 's/192\.168\.10\./==IPV4_NET_ADMIN_PREFIX==/g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/172\.16\.10\./==IPV4_NET_CONTROL_PREFIX==/g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/10\.1\.0\./==IPV4_NET_TENANT_PREFIX==/g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/10\.16\.0\./==IPV4_NET_EXTERNAL_PREFIX==/g' {} +
+ export REPLACE_DIRS="/srv/salt/reclass/classes/ /srv/salt/reclass/nodes/"
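+ # Limit the replacements to the model classes and nodes so that other files
+ # under /srv/salt/reclass/ (e.g. git metadata) are left untouched.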
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/192\.168\.10\./==IPV4_NET_ADMIN_PREFIX==/g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/172\.16\.10\./==IPV4_NET_CONTROL_PREFIX==/g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/10\.1\.0\./==IPV4_NET_TENANT_PREFIX==/g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/10\.16\.0\./==IPV4_NET_EXTERNAL_PREFIX==/g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/==IPV4_NET_ADMIN_PREFIX==/{{ IPV4_NET_ADMIN_PREFIX }}./g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/==IPV4_NET_CONTROL_PREFIX==/{{ IPV4_NET_CONTROL_PREFIX }}./g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/==IPV4_NET_TENANT_PREFIX==/{{ IPV4_NET_TENANT_PREFIX }}./g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/==IPV4_NET_EXTERNAL_PREFIX==/{{ IPV4_NET_EXTERNAL_PREFIX }}./g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/==IPV4_NET_ADMIN_PREFIX==/{{ IPV4_NET_ADMIN_PREFIX }}./g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/==IPV4_NET_CONTROL_PREFIX==/{{ IPV4_NET_CONTROL_PREFIX }}./g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/==IPV4_NET_TENANT_PREFIX==/{{ IPV4_NET_TENANT_PREFIX }}./g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/==IPV4_NET_EXTERNAL_PREFIX==/{{ IPV4_NET_EXTERNAL_PREFIX }}./g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/apt_mk_version:.*/apt_mk_version: {{ REPOSITORY_SUITE }}/g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/apt_mk_version:.*/apt_mk_version: {{ REPOSITORY_SUITE }}/g' {} +
{%- if IS_CONTRAIL_LAB %}
# vSRX IPs for tcp-qa images have 172.16.10.90 hardcoded
- find /srv/salt/reclass/ -type f -exec sed -i 's/opencontrail_router01_address:.*/opencontrail_router01_address: 172.16.10.90/g' {} +
+ find ${REPLACE_DIRS} -type f -exec sed -i 's/opencontrail_router01_address:.*/opencontrail_router01_address: 172.16.10.90/g' {} +
{%- endif %}
# Disable checking out the model from the remote repository
diff --git a/tcp_tests/templates/virtual-mcp11-k8s-contrail/k8s.yaml b/tcp_tests/templates/virtual-mcp11-k8s-contrail/k8s.yaml
index 84766d9..8fc7977 100644
--- a/tcp_tests/templates/virtual-mcp11-k8s-contrail/k8s.yaml
+++ b/tcp_tests/templates/virtual-mcp11-k8s-contrail/k8s.yaml
@@ -21,8 +21,8 @@
# TODO Remove workaround when linklocal on kube-api VIP on ens3 works fine
- description: Replace kube-api VIP with IP of one controller
cmd: |
- find /srv/salt/reclass/ -type f -exec sed -i 's/ipf_addresses:\ \${_param:kubernetes_control_address}/ipf_addresses:\ \${_param:kubernetes_control_node01_address}/g' {} +
- find /srv/salt/reclass/ -type f -exec sed -i 's/ipf_port:\ 443/ipf_port:\ 6443/g' {} +
+ find /srv/salt/reclass/classes/ -type f -exec sed -i 's/ipf_addresses:\ \${_param:kubernetes_control_address}/ipf_addresses:\ \${_param:kubernetes_control_node01_address}/g' {} +
+ find /srv/salt/reclass/classes/ -type f -exec sed -i 's/ipf_port:\ 443/ipf_port:\ 6443/g' {} +
node_name: {{ HOSTNAME_CFG01 }}
retry: {count: 1, delay: 1}
skip_fail: false
diff --git a/tcp_tests/tests/system/test_install_mcp11_ovs_ocata.py b/tcp_tests/tests/system/test_install_mcp11_ovs_ocata.py
index 7c2f788..0ee71f4 100644
--- a/tcp_tests/tests/system/test_install_mcp11_ovs_ocata.py
+++ b/tcp_tests/tests/system/test_install_mcp11_ovs_ocata.py
@@ -46,7 +46,7 @@
@pytest.mark.cz8119
def test_mcp11_ocata_ovs_sl_install(self, underlay, config,
openstack_deployed,
- sl_deployed, sl_actions, show_step):
+ sl_deployed, show_step):
"""Test for deploying an mcp environment and check it
Scenario:
1. Prepare salt on hosts
@@ -54,9 +54,9 @@
3. Setup compute nodes
4. Get monitoring nodes
5. Check that docker services are running
- 6. Check current targets are UP
- 7. Check grafana dashboards
-
+        6. Check that the current Prometheus targets are UP
+        7. Run SL component tests
+        8. Download the SL component test report
"""
expected_service_list = ['monitoring_remote_storage_adapter',
'monitoring_server',
@@ -65,43 +65,25 @@
'monitoring_alertmanager',
'monitoring_remote_collector',
'monitoring_pushgateway']
- # STEP #4
- mon_nodes = sl_actions.get_monitoring_nodes()
+ show_step(4)
+ mon_nodes = sl_deployed.get_monitoring_nodes()
LOG.debug('Mon nodes list {0}'.format(mon_nodes))
- for node in mon_nodes:
- services_status = sl_actions.get_service_info_from_node(node)
- assert len(services_status) == len(expected_service_list), \
- 'Some services are missed on node {0}. ' \
- 'Current service list {1}'.format(node, services_status)
- for service in expected_service_list:
- assert service in services_status, \
- 'Missing service {0} in {1}'.format(service, services_status)
- assert '0' not in services_status.get(service), \
- 'Service {0} failed to start'.format(service)
- prometheus_client = sl_deployed.api
- try:
- current_targets = prometheus_client.get_targets()
- LOG.debug('Current targets after install {0}'.format(current_targets))
- except:
- LOG.info('Restarting keepalived service on mon nodes...')
- sl_actions._salt.local(tgt='mon*', fun='cmd.run',
- args='systemctl restart keepalived')
- LOG.warning(
- 'Ip states after force restart {0}'.format(
- sl_actions._salt.local(tgt='mon*',
- fun='cmd.run', args='ip a')))
- current_targets = prometheus_client.get_targets()
- LOG.debug('Current targets after install {0}'.format(current_targets))
- # Assert that targets are up
- for entry in current_targets:
- assert 'up' in entry['health'], \
- 'Next target is down {}'.format(entry)
+
+ show_step(5)
+ sl_deployed.check_docker_services(mon_nodes, expected_service_list)
+
+ show_step(6)
+ sl_deployed.check_prometheus_targets(mon_nodes)
+
+ show_step(7)
-            # Run SL component tetsts
+            # Run SL component tests
- sl_actions.run_sl_functional_tests(
+ sl_deployed.run_sl_functional_tests(
'cfg01',
'/root/stacklight-pytest/stacklight_tests/tests/prometheus')
+
+ show_step(8)
# Download report
- sl_actions.download_sl_test_report(
+ sl_deployed.download_sl_test_report(
'cfg01',
'/root/stacklight-pytest/stacklight_tests')
LOG.info("*************** DONE **************")
@@ -128,7 +110,7 @@
@pytest.mark.cz8120
def test_mcp11_ocata_dvr_sl_install(self, underlay, config,
openstack_deployed,
- sl_deployed, sl_actions, show_step):
+ sl_deployed, show_step):
"""Test for deploying an mcp environment and check it
Scenario:
1. Prepare salt on hosts
@@ -136,9 +118,9 @@
3. Setup compute nodes
4. Get monitoring nodes
5. Check that docker services are running
- 6. Check current targets are UP
- 7. Check grafana dashboards
-
+        6. Check that the current Prometheus targets are UP
+        7. Run SL component tests
+        8. Download the SL component test report
"""
expected_service_list = ['monitoring_remote_storage_adapter',
'monitoring_server',
@@ -147,48 +129,25 @@
'monitoring_alertmanager',
'monitoring_remote_collector',
'monitoring_pushgateway']
- # STEP #4
- mon_nodes = sl_actions.get_monitoring_nodes()
+ show_step(4)
+ mon_nodes = sl_deployed.get_monitoring_nodes()
LOG.debug('Mon nodes list {0}'.format(mon_nodes))
- for node in mon_nodes:
- services_status = sl_actions.get_service_info_from_node(node)
- assert len(services_status) == len(expected_service_list), \
- 'Some services are missed on node {0}. ' \
- 'Current service list {1}'.format(node, services_status)
- for service in expected_service_list:
- assert service in services_status,\
- 'Missing service {0} in {1}'.format(service, services_status)
- assert '0' not in services_status.get(service),\
- 'Service {0} failed to start'.format(service)
- prometheus_client = sl_deployed.api
- try:
- current_targets = prometheus_client.get_targets()
- LOG.debug('Current targets after install {0}'.format(current_targets))
- except:
- LOG.info('Restarting keepalived service on mon nodes...')
- sl_actions._salt.local(tgt='mon*', fun='cmd.run',
- args='systemctl restart keepalived')
- LOG.warning(
- 'Ip states after force restart {0}'.format(
- sl_actions._salt.local(tgt='mon*',
- fun='cmd.run', args='ip a')))
- current_targets = prometheus_client.get_targets()
- LOG.debug('Current targets after install {0}'.format(current_targets))
- # Assert that targets are up
- for entry in current_targets:
- assert 'up' in entry['health'], \
- 'Next target is down {}'.format(entry)
- # Assert that targets are up
- for entry in current_targets:
- assert 'up' in entry['health'], \
- 'Next target is down {}'.format(entry)
- # Run SL component tetsts
- sl_actions.run_sl_functional_tests(
+ show_step(5)
+ sl_deployed.check_docker_services(mon_nodes, expected_service_list)
+
+ show_step(6)
+ sl_deployed.check_prometheus_targets(mon_nodes)
+
+ show_step(7)
+ # Run SL component tests
+ sl_deployed.run_sl_functional_tests(
'cfg01',
'/root/stacklight-pytest/stacklight_tests/tests/prometheus')
+
+ show_step(8)
# Download report
- sl_actions.download_sl_test_report(
+ sl_deployed.download_sl_test_report(
'cfg01',
'/root/stacklight-pytest/stacklight_tests')
LOG.info("*************** DONE **************")