[Tooling update] health_checks
* Added:
- compute vrouter namespaces list
* Fixed:
- OC4.x use case for contrail-status and nodetool
- Disabled process list check for OC4.x vrouters.
Prod-Related: PROD-31970
Change-Id: I7e9409c366b0bbae61b05882b6315d03b28c6c86
diff --git a/README.rst b/README.rst
index bad75e7..9d41c97 100644
--- a/README.rst
+++ b/README.rst
@@ -718,11 +718,18 @@
salt-call health_checks.ntp_check
salt-call health_checks.ntp_check min_peers=2 max_stratum=2
+List vrouter namespaces on contrail computes:
+
+.. code-block:: bash
+
+ salt -C 'I@opencontrail:compute' health_checks.list_namespaces
+
Verify contrail nodes contrail-status output:
.. code-block:: bash
salt-call health_checks.contrail_check debug=True
+ salt-call health_checks.contrail_check nodetool_expected_size=2 debug=True
Verify galera cluster status:
diff --git a/_modules/health_checks.py b/_modules/health_checks.py
index 9428fd5..ea34371 100644
--- a/_modules/health_checks.py
+++ b/_modules/health_checks.py
@@ -245,15 +245,27 @@
def contrail_check(target='I@opencontrail:control or I@opencontrail:collector or I@opencontrail:compute', nodetool_target='I@opencontrail:control or I@opencontrail:collector', compute_target='I@opencontrail:compute', target_type='compound', nodetool_target_type='compound', compute_target_type='compound', nodetool_expected_size=3, proc_min_uptime=30, ignore_dead=False, **kwargs):
- ''' Verify contrail status returns nothing critical '''
+ ''' Verify contrail infrastructure '''
+
+ use_doctrail = False
+ oc_ver = str(__salt__['pillar.get']('_param:opencontrail_version'))
+ if len(oc_ver) > 1:
+ if oc_ver[0] == '4':
+ use_doctrail = True
agent = "Contrail status"
+ if use_doctrail:
+ # Compute nodes still do not use doctrail, but they are part of the compound target.
+ # To minimize salt calls, one shell command runs plain contrail-status when doctrail is not found
+ arg_cmd = 'test $(whereis -b doctrail | grep -c " ") -eq 0 && contrail-status || doctrail all contrail-status'
+ else:
+ arg_cmd = "contrail-status"
# Check #1 contrail-status
out = __salt__['saltutil.cmd']( tgt=target,
tgt_type=target_type,
fun='cmd.run',
- arg=['contrail-status'],
+ arg=[arg_cmd],
timeout=5
) or None
@@ -262,13 +274,16 @@
return False
failed_minions = []
- pattern = '^(==|$|\S+\s+(active|backup|inactive\s\(disabled\son\sboot\)))'
+ pattern = '^(==|\*+$|$|\S+\s+(active|backup|inactive\s\(disabled\son\sboot\)))'
prog = re.compile(pattern)
validated = []
for minion in out:
for line in out[minion]['ret'].split('\n'):
- if not prog.match(line) and minion not in failed_minions:
+ check_line = True
+ if " FOR NODE " in line:
+ check_line = False
+ if check_line and not prog.match(line) and minion not in failed_minions:
failed_minions.append(minion)
validated.append(minion)
@@ -278,11 +293,16 @@
if kwargs.get("debug", False):
logger.info(validated)
+ if use_doctrail:
+ arg_cmd = "doctrail all nodetool status"
+ else:
+ arg_cmd = "nodetool status"
+
# Check #2 nodetool
out = __salt__['saltutil.cmd']( tgt=nodetool_target,
tgt_type=nodetool_target_type,
fun='cmd.run',
- arg=['nodetool status'],
+ arg=[arg_cmd],
timeout=5
) or None
@@ -311,39 +331,80 @@
logger.info(validated)
# Check #3 process status control
- out = __salt__['saltutil.cmd']( tgt=nodetool_target,
- tgt_type=nodetool_target_type,
- fun='health_checks.contrail_process_list',
- arg=['role=controller'],
- timeout=5
- ) or None
- if not _minions_output(out, agent, ignore_dead):
- __context__['retcode'] = 2
- return False
+ # Contrail 4.x does not produce pid info from contrail-status -d,
+ # so this check is skipped for 4.x and another method is to be used later
+ # TODO: check process list state for oc4 env
+ if not use_doctrail:
- failed_minions = []
- validated = []
- for minion in out:
- procs = out[minion]['ret']
- for proc in procs:
- proc_uptime = procs[proc]['uptime']
- if proc_uptime < proc_min_uptime:
- if minion not in failed_minions:
- failed_minions.append(minion)
- logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
- validated.append(minion)
+ out = __salt__['saltutil.cmd']( tgt=nodetool_target,
+ tgt_type=nodetool_target_type,
+ fun='health_checks.contrail_process_list',
+ arg=['role=controller'],
+ timeout=5
+ ) or None
- if not _failed_minions(out, agent, failed_minions):
- __context__['retcode'] = 2
- return False
- if kwargs.get("debug", False):
- logger.info(validated)
+ if not _minions_output(out, agent, ignore_dead):
+ __context__['retcode'] = 2
+ return False
+
+ failed_minions = []
+ validated = []
+ for minion in out:
+ procs = out[minion]['ret']
+ for proc in procs:
+ proc_uptime = procs[proc]['uptime']
+ if proc_uptime < proc_min_uptime:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+ logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
+ validated.append(minion)
+
+ if not _failed_minions(out, agent, failed_minions):
+ __context__['retcode'] = 2
+ return False
+ if kwargs.get("debug", False):
+ logger.info(validated)
# Check #4 process status computes
+
+ # Contrail 4.x does not produce pid info from contrail-status -d,
+ # so this check is skipped for 4.x and another method is to be used later
+ # TODO: check process list state for oc4 env
+ if not use_doctrail:
+
+ out = __salt__['saltutil.cmd']( tgt=compute_target,
+ tgt_type=compute_target_type,
+ fun='health_checks.contrail_process_list',
+ timeout=5
+ ) or None
+
+ if not _minions_output(out, agent, ignore_dead):
+ __context__['retcode'] = 2
+ return False
+
+ failed_minions = []
+ validated = []
+ for minion in out:
+ procs = out[minion]['ret']
+ for proc in procs:
+ proc_uptime = procs[proc]['uptime']
+ if proc_uptime < proc_min_uptime:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+ logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
+ validated.append(minion)
+
+ if not _failed_minions(out, agent, failed_minions):
+ __context__['retcode'] = 2
+ return False
+ if kwargs.get("debug", False):
+ logger.info(validated)
+
+ # Check #5 compute vrouter namespaces duplicates check
out = __salt__['saltutil.cmd']( tgt=compute_target,
tgt_type=compute_target_type,
- fun='health_checks.contrail_process_list',
+ fun='health_checks.list_namespaces',
timeout=5
) or None
@@ -353,22 +414,26 @@
failed_minions = []
validated = []
+ all_namespaces = []
for minion in out:
- procs = out[minion]['ret']
- for proc in procs:
- proc_uptime = procs[proc]['uptime']
- if proc_uptime < proc_min_uptime:
+ namespaces = out[minion]['ret']
+ for ns in namespaces:
+ if ns['uuid'] not in all_namespaces:
+ all_namespaces.append(ns['uuid'])
+ else:
if minion not in failed_minions:
failed_minions.append(minion)
- logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
- validated.append(minion)
+ logger.error({'minion': minion, 'uuid': ns['uuid']})
+ validated.append(minion)
if not _failed_minions(out, agent, failed_minions):
+ logger.error("Duplicated SNAT vrouters found. Please reset their gateways")
__context__['retcode'] = 2
return False
if kwargs.get("debug", False):
logger.info(validated)
+ # TODO: peers check
return True
@@ -1769,3 +1834,32 @@
return True
+
+def list_namespaces(raw_output=False):
+
+ ''' List of ip netns namespaces, each a dict with 'id' and 'uuid' keys '''
+
+ proc = subprocess.Popen(['ip', 'netns'], stdout=subprocess.PIPE)
+ stdout, stderr = proc.communicate()
+
+ namespaces = []
+
+ for line in stdout.split('\n'):
+ if len(line) > 0:
+ netns = {}
+ if raw_output:
+ netns['id'] = -2
+ netns['uuid'] = line
+ else:
+ line_splitted = line.split()
+ if len(line_splitted) > 1:
+ ns_uuid = line_splitted[0]
+ ns_id = int(line_splitted[2][:-1])
+ else:
+ ns_uuid = line
+ ns_id = -1
+ netns['id'] = ns_id
+ netns['uuid'] = ns_uuid
+ namespaces.append(netns)
+
+ return namespaces