[Tooling update] health_checks
* Added:
- contrail process list
- extended contrail_check to check nodetool status
- extended contrail_check to check process uptime
* Code sugar fix: sorted module imports
Prod-Related: PROD-31970
Change-Id: I878f2307ce13800eec1558579b20fcefa49e2b25
diff --git a/_modules/health_checks.py b/_modules/health_checks.py
index 7a11ab5..9428fd5 100644
--- a/_modules/health_checks.py
+++ b/_modules/health_checks.py
@@ -1,12 +1,13 @@
+import datetime
 import hashlib
-import requests
-import subprocess
-import socket
-import salt.utils
+import json
 import logging
 import os
 import re
-import json
+import requests
+import salt.utils
+import socket
+import subprocess
 import yaml
 
 __author__ = "Dzmitry Stremkouski"
@@ -195,11 +196,60 @@
     return True
 
 
-def contrail_check(target='I@opencontrail:control or I@opencontrail:collector or I@opencontrail:compute', target_type='compound', ignore_dead=False, **kwargs):
+def contrail_process_list(**kwargs):
+
+    ''' Retrieve contrail process pids and uptime (seconds) per service.
+
+    kwargs: role='controller' also inspects zookeeper and ifmap-server
+    via "service NAME status"; the default role is 'compute'.
+    Returns {name: {'pid': int, 'uptime': int}}; uptime is 0 when
+    /proc/PID/stat cannot be read or parsed. '''
+
+    def _scan(cmd, pattern):
+        # Run cmd and map service name -> pid for every matching line.
+        # Regex groups are used instead of whitespace column positions so
+        # a multi-word state column cannot shift the pid out of place.
+        child = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                 universal_newlines=True)
+        found = {}
+        for line in child.communicate()[0].split('\n'):
+            match = re.match(pattern, line)
+            if match:
+                found[match.group(1)] = int(match.group(2))
+        return found
+
+    procs = _scan(['contrail-status', '-d'], r'^(\S+).*pid ([0-9]+),.*$')
+    if kwargs.get('role', 'compute') == 'controller':
+        for service in ['zookeeper', 'ifmap-server']:
+            procs.update(_scan(['service', service, 'status'],
+                               r'^(\S+).*process ([0-9]+)$'))
+
+    ctime = int(datetime.datetime.now().strftime("%s"))
+    with open("/proc/stat") as f:
+        btime = int(re.search(r"^btime (\d+)$", f.read(), re.MULTILINE).group(1))
+    clk_tck = os.sysconf(os.sysconf_names["SC_CLK_TCK"])
+    for name, pid in list(procs.items()):
+        try:
+            with open('/proc/%s/stat' % str(pid), 'r') as f:
+                # Text after ') ' starts at field 3 (state), so index 19
+                # is field 22, starttime, in clock ticks since boot.
+                start_ticks = int(f.read().split(') ')[1].split()[19])
+            proc_uptime = ctime - btime - start_ticks // clk_tck
+        except (IOError, OSError, IndexError, ValueError):
+            # Process exited or stat is malformed: report zero uptime.
+            proc_uptime = 0
+        procs[name] = {'pid': pid, 'uptime': proc_uptime}
+
+    return procs
+
+
+def contrail_check(target='I@opencontrail:control or I@opencontrail:collector or I@opencontrail:compute', nodetool_target='I@opencontrail:control or I@opencontrail:collector', compute_target='I@opencontrail:compute', target_type='compound', nodetool_target_type='compound', compute_target_type='compound', nodetool_expected_size=3, proc_min_uptime=30, ignore_dead=False, **kwargs):
 
     ''' Verify contrail status returns nothing critical '''
 
     agent = "Contrail status"
+
+    # Check #1 contrail-status
     out = __salt__['saltutil.cmd']( tgt=target,
                                     tgt_type=target_type,
                                     fun='cmd.run',
@@ -225,9 +275,100 @@
     if not _failed_minions(out, agent, failed_minions):
         __context__['retcode'] = 2
         return False
-
     if kwargs.get("debug", False):
         logger.info(validated)
+
+    # Check #2 nodetool
+    out = __salt__['saltutil.cmd']( tgt=nodetool_target,
+                                    tgt_type=nodetool_target_type,
+                                    fun='cmd.run',
+                                    arg=['nodetool status'],
+                                    timeout=5
+                                  ) or None
+
+    if not _minions_output(out, agent, ignore_dead):
+        __context__['retcode'] = 2
+        return False
+
+    failed_minions = []
+    pattern = '^UN'
+    prog = re.compile(pattern)
+
+    validated = []
+    for minion in out:
+        size = 0
+        for line in out[minion]['ret'].split('\n'):
+            if prog.match(line):
+                size += 1
+        if not size == nodetool_expected_size and minion not in failed_minions:
+                failed_minions.append(minion)
+        validated.append(minion)
+
+    if not _failed_minions(out, agent, failed_minions):
+        __context__['retcode'] = 2
+        return False
+    if kwargs.get("debug", False):
+        logger.info(validated)
+
+    # Check #3 process status control
+    out = __salt__['saltutil.cmd']( tgt=nodetool_target,
+                                    tgt_type=nodetool_target_type,
+                                    fun='health_checks.contrail_process_list',
+                                    arg=['role=controller'],
+                                    timeout=5
+                                  ) or None
+
+    if not _minions_output(out, agent, ignore_dead):
+        __context__['retcode'] = 2
+        return False
+
+    failed_minions = []
+    validated = []
+    for minion in out:
+        procs = out[minion]['ret']
+        for proc in procs:
+            proc_uptime = procs[proc]['uptime']
+            if proc_uptime < proc_min_uptime:
+                if minion not in failed_minions:
+                    failed_minions.append(minion)
+                    logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
+        validated.append(minion)
+
+    if not _failed_minions(out, agent, failed_minions):
+        __context__['retcode'] = 2
+        return False
+    if kwargs.get("debug", False):
+        logger.info(validated)
+
+    # Check #4 process status computes
+    out = __salt__['saltutil.cmd']( tgt=compute_target,
+                                    tgt_type=compute_target_type,
+                                    fun='health_checks.contrail_process_list',
+                                    timeout=5
+                                  ) or None
+
+    if not _minions_output(out, agent, ignore_dead):
+        __context__['retcode'] = 2
+        return False
+
+    failed_minions = []
+    validated = []
+    for minion in out:
+        procs = out[minion]['ret']
+        for proc in procs:
+            proc_uptime = procs[proc]['uptime']
+            if proc_uptime < proc_min_uptime:
+                if minion not in failed_minions:
+                    failed_minions.append(minion)
+                    logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
+        validated.append(minion)
+
+    if not _failed_minions(out, agent, failed_minions):
+        __context__['retcode'] = 2
+        return False
+    if kwargs.get("debug", False):
+        logger.info(validated)
+
     return True