[Tooling update] health_checks
* Added:
- contrail process list
- extended contrail_check to check nodetool status
- extended contrail_check to check process uptime
* Code cleanup: sorted module imports
Prod-Related: PROD-31970
Change-Id: I878f2307ce13800eec1558579b20fcefa49e2b25
diff --git a/_modules/health_checks.py b/_modules/health_checks.py
index 7a11ab5..9428fd5 100644
--- a/_modules/health_checks.py
+++ b/_modules/health_checks.py
@@ -1,12 +1,13 @@
+import datetime
import hashlib
-import requests
-import subprocess
-import socket
-import salt.utils
+import json
import logging
import os
import re
-import json
+import requests
+import salt.utils
+import socket
+import subprocess
import yaml
__author__ = "Dzmitry Stremkouski"
@@ -195,11 +196,60 @@
return True
-def contrail_check(target='I@opencontrail:control or I@opencontrail:collector or I@opencontrail:compute', target_type='compound', ignore_dead=False, **kwargs):
+def contrail_process_list(**kwargs):
+
+ ''' Retrieve contrail process pids and start_time '''
+
+ cmd = ['contrail-status', '-d']
+
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+ stdout, stderr = proc.communicate()
+
+ procs = {}
+ for line in stdout.split('\n'):
+ if re.findall('^(\S+).*pid ([0-9]+),.*$', line):
+ stat = line.split()
+ procs[stat[0]] = int(stat[3][:-1])
+
+ if kwargs.get('role', 'compute') == 'controller':
+
+ for service in ['zookeeper', 'ifmap-server']:
+ cmd = ['service', service, 'status']
+
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+ stdout, stderr = proc.communicate()
+
+ for line in stdout.split('\n'):
+ if re.findall('^(\S+).*process ([0-9]+)$', line):
+ stat = line.split()
+ procs[stat[0]] = int(stat[3])
+
+ ctime = int(datetime.datetime.now().strftime("%s"))
+ btime_re = re.compile(r"^btime (\d+)$", re.MULTILINE)
+ btime_groups = btime_re.search(open("/proc/stat").read())
+ btime = int(btime_groups.groups()[0])
+ clk_tck = os.sysconf(os.sysconf_names["SC_CLK_TCK"])
+ for proc in procs:
+ pid = procs[proc]
+ try:
+ with open('/proc/%s/stat' % str(pid), 'r') as f:
+ stat = f.read()
+ jitty_time = int(stat.split(') ')[1].split()[19]) / clk_tck
+ proc_uptime = ctime - btime - int(jitty_time)
+ except:
+ proc_uptime = 0
+ procs[proc] = { 'pid': pid, 'uptime': proc_uptime }
+
+ return procs
+
+
+def contrail_check(target='I@opencontrail:control or I@opencontrail:collector or I@opencontrail:compute', nodetool_target='I@opencontrail:control or I@opencontrail:collector', compute_target='I@opencontrail:compute', target_type='compound', nodetool_target_type='compound', compute_target_type='compound', nodetool_expected_size=3, proc_min_uptime=30, ignore_dead=False, **kwargs):
''' Verify contrail status returns nothing critical '''
agent = "Contrail status"
+
+ # Check #1 contrail-status
out = __salt__['saltutil.cmd']( tgt=target,
tgt_type=target_type,
fun='cmd.run',
@@ -225,9 +275,100 @@
if not _failed_minions(out, agent, failed_minions):
__context__['retcode'] = 2
return False
-
if kwargs.get("debug", False):
logger.info(validated)
+
+ # Check #2 nodetool
+ out = __salt__['saltutil.cmd']( tgt=nodetool_target,
+ tgt_type=nodetool_target_type,
+ fun='cmd.run',
+ arg=['nodetool status'],
+ timeout=5
+ ) or None
+
+ if not _minions_output(out, agent, ignore_dead):
+ __context__['retcode'] = 2
+ return False
+
+ failed_minions = []
+ pattern = '^UN'
+ prog = re.compile(pattern)
+
+ validated = []
+ for minion in out:
+ size = 0
+ for line in out[minion]['ret'].split('\n'):
+ if prog.match(line):
+ size += 1
+ if not size == nodetool_expected_size and minion not in failed_minions:
+ failed_minions.append(minion)
+ validated.append(minion)
+
+ if not _failed_minions(out, agent, failed_minions):
+ __context__['retcode'] = 2
+ return False
+ if kwargs.get("debug", False):
+ logger.info(validated)
+
+ # Check #3 process status control
+ out = __salt__['saltutil.cmd']( tgt=nodetool_target,
+ tgt_type=nodetool_target_type,
+ fun='health_checks.contrail_process_list',
+ arg=['role=controller'],
+ timeout=5
+ ) or None
+
+ if not _minions_output(out, agent, ignore_dead):
+ __context__['retcode'] = 2
+ return False
+
+ failed_minions = []
+ validated = []
+ for minion in out:
+ procs = out[minion]['ret']
+ for proc in procs:
+ proc_uptime = procs[proc]['uptime']
+ if proc_uptime < proc_min_uptime:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+ logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
+ validated.append(minion)
+
+ if not _failed_minions(out, agent, failed_minions):
+ __context__['retcode'] = 2
+ return False
+ if kwargs.get("debug", False):
+ logger.info(validated)
+
+ # Check #4 process status computes
+ out = __salt__['saltutil.cmd']( tgt=compute_target,
+ tgt_type=compute_target_type,
+ fun='health_checks.contrail_process_list',
+ timeout=5
+ ) or None
+
+ if not _minions_output(out, agent, ignore_dead):
+ __context__['retcode'] = 2
+ return False
+
+ failed_minions = []
+ validated = []
+ for minion in out:
+ procs = out[minion]['ret']
+ for proc in procs:
+ proc_uptime = procs[proc]['uptime']
+ if proc_uptime < proc_min_uptime:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+ logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
+ validated.append(minion)
+
+ if not _failed_minions(out, agent, failed_minions):
+ __context__['retcode'] = 2
+ return False
+ if kwargs.get("debug", False):
+ logger.info(validated)
+
return True