[Tooling update] health_checks

* Added:
- compute vrouter namespaces list

* Fixed:
- OC4.x use case for contrail-status and nodetool
- Disabled process list check for OC4.x vrouters.

Prod-Related: PROD-31970

Change-Id: I7e9409c366b0bbae61b05882b6315d03b28c6c86
diff --git a/README.rst b/README.rst
index bad75e7..9d41c97 100644
--- a/README.rst
+++ b/README.rst
@@ -718,11 +718,18 @@
   salt-call health_checks.ntp_check
   salt-call health_checks.ntp_check min_peers=2 max_stratum=2
 
+List vrouter namespaces on contrail computes:
+
+.. code-block:: bash
+
+  salt -C 'I@opencontrail:compute' health_checks.list_namespaces
+
 Verify contrail nodes contrail-status output:
 
 .. code-block:: bash
 
   salt-call health_checks.contrail_check debug=True
+  salt-call health_checks.contrail_check nodetool_expected_size=2 debug=True
 
 Verify galera cluster status:
 
diff --git a/_modules/health_checks.py b/_modules/health_checks.py
index 9428fd5..ea34371 100644
--- a/_modules/health_checks.py
+++ b/_modules/health_checks.py
@@ -245,15 +245,27 @@
 
 def contrail_check(target='I@opencontrail:control or I@opencontrail:collector or I@opencontrail:compute', nodetool_target='I@opencontrail:control or I@opencontrail:collector', compute_target='I@opencontrail:compute', target_type='compound', nodetool_target_type='compound', compute_target_type='compound', nodetool_expected_size=3, proc_min_uptime=30, ignore_dead=False, **kwargs):
 
-    ''' Verify contrail status returns nothing critical '''
+    ''' Verify contrail infrastructure '''
+
+    use_doctrail = False
+    oc_ver = str(__salt__['pillar.get']('_param:opencontrail_version'))
+    if len(oc_ver) > 1:
+        if oc_ver[0] == '4':
+            use_doctrail = True
 
     agent = "Contrail status"
+    if use_doctrail:
+        # Compute nodes still do not use doctrail, but they are in the compound target.
+        # To minimize salt calls, a shell '&&'/'||' chain runs plain contrail-status where doctrail is absent
+        arg_cmd = 'test $(whereis -b doctrail | grep -c " ") -eq 0 && contrail-status || doctrail all contrail-status'
+    else:
+        arg_cmd = "contrail-status"
 
     # Check #1 contrail-status
     out = __salt__['saltutil.cmd']( tgt=target,
                                     tgt_type=target_type,
                                     fun='cmd.run',
-                                    arg=['contrail-status'],
+                                    arg=[arg_cmd],
                                     timeout=5
                                   ) or None
 
@@ -262,13 +274,16 @@
         return False
 
     failed_minions = []
-    pattern = '^(==|$|\S+\s+(active|backup|inactive\s\(disabled\son\sboot\)))'
+    pattern = '^(==|\*+$|$|\S+\s+(active|backup|inactive\s\(disabled\son\sboot\)))'
     prog = re.compile(pattern)
 
     validated = []
     for minion in out:
         for line in out[minion]['ret'].split('\n'):
-            if not prog.match(line) and minion not in failed_minions:
+            check_line = True
+            if " FOR NODE " in line:
+                check_line = False
+            if check_line and not prog.match(line) and minion not in failed_minions:
                 failed_minions.append(minion)
         validated.append(minion)
 
@@ -278,11 +293,16 @@
     if kwargs.get("debug", False):
         logger.info(validated)
 
+    if use_doctrail:
+        arg_cmd = "doctrail all nodetool status"
+    else:
+        arg_cmd = "nodetool status"
+
     # Check #2 nodetool
     out = __salt__['saltutil.cmd']( tgt=nodetool_target,
                                     tgt_type=nodetool_target_type,
                                     fun='cmd.run',
-                                    arg=['nodetool status'],
+                                    arg=[arg_cmd],
                                     timeout=5
                                   ) or None
 
@@ -311,39 +331,80 @@
         logger.info(validated)
 
     # Check #3 process status control
-    out = __salt__['saltutil.cmd']( tgt=nodetool_target,
-                                    tgt_type=nodetool_target_type,
-                                    fun='health_checks.contrail_process_list',
-                                    arg=['role=controller'],
-                                    timeout=5
-                                  ) or None
 
-    if not _minions_output(out, agent, ignore_dead):
-        __context__['retcode'] = 2
-        return False
+    # Contrail 4.x does not produce pid info from contrail-status -d
+    # Will skip this check and use another method further
+    # TODO: check process list state for oc4 env
+    if not use_doctrail:
 
-    failed_minions = []
-    validated = []
-    for minion in out:
-        procs = out[minion]['ret']
-        for proc in procs:
-            proc_uptime = procs[proc]['uptime']
-            if proc_uptime < proc_min_uptime:
-                if minion not in failed_minions:
-                    failed_minions.append(minion)
-                    logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
-        validated.append(minion)
+        out = __salt__['saltutil.cmd']( tgt=nodetool_target,
+                                        tgt_type=nodetool_target_type,
+                                        fun='health_checks.contrail_process_list',
+                                        arg=['role=controller'],
+                                        timeout=5
+                                      ) or None
 
-    if not _failed_minions(out, agent, failed_minions):
-        __context__['retcode'] = 2
-        return False
-    if kwargs.get("debug", False):
-        logger.info(validated)
+        if not _minions_output(out, agent, ignore_dead):
+            __context__['retcode'] = 2
+            return False
+
+        failed_minions = []
+        validated = []
+        for minion in out:
+            procs = out[minion]['ret']
+            for proc in procs:
+                proc_uptime = procs[proc]['uptime']
+                if proc_uptime < proc_min_uptime:
+                    if minion not in failed_minions:
+                        failed_minions.append(minion)
+                        logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
+            validated.append(minion)
+
+        if not _failed_minions(out, agent, failed_minions):
+            __context__['retcode'] = 2
+            return False
+        if kwargs.get("debug", False):
+            logger.info(validated)
 
     # Check #4 process status computes
+
+    # Contrail 4.x does not produce pid info from contrail-status -d
+    # Will skip this check and use another method further
+    # TODO: check process list state for oc4 env
+    if not use_doctrail:
+
+        out = __salt__['saltutil.cmd']( tgt=compute_target,
+                                        tgt_type=compute_target_type,
+                                        fun='health_checks.contrail_process_list',
+                                        timeout=5
+                                      ) or None
+
+        if not _minions_output(out, agent, ignore_dead):
+            __context__['retcode'] = 2
+            return False
+
+        failed_minions = []
+        validated = []
+        for minion in out:
+            procs = out[minion]['ret']
+            for proc in procs:
+                proc_uptime = procs[proc]['uptime']
+                if proc_uptime < proc_min_uptime:
+                    if minion not in failed_minions:
+                        failed_minions.append(minion)
+                        logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
+            validated.append(minion)
+
+        if not _failed_minions(out, agent, failed_minions):
+            __context__['retcode'] = 2
+            return False
+        if kwargs.get("debug", False):
+            logger.info(validated)
+
+    # Check #5 compute vrouter namespaces duplicates check
     out = __salt__['saltutil.cmd']( tgt=compute_target,
                                     tgt_type=compute_target_type,
-                                    fun='health_checks.contrail_process_list',
+                                    fun='health_checks.list_namespaces',
                                     timeout=5
                                   ) or None
 
@@ -353,22 +414,26 @@
 
     failed_minions = []
     validated = []
+    all_namespaces = []
     for minion in out:
-        procs = out[minion]['ret']
-        for proc in procs:
-            proc_uptime = procs[proc]['uptime']
-            if proc_uptime < proc_min_uptime:
+        namespaces = out[minion]['ret']
+        for ns in namespaces:
+            if ns['uuid'] not in all_namespaces:
+                all_namespaces.append(ns['uuid'])
+            else:
                 if minion not in failed_minions:
                     failed_minions.append(minion)
-                    logger.error({'minion': minion, 'name': proc, 'uptime': proc_uptime})
-        validated.append(minion)
+                    logger.error({'minion': minion, 'uuid': ns['uuid']})
+            validated.append(minion)
 
     if not _failed_minions(out, agent, failed_minions):
+        logger.error("Duplicated SNAT vrouters found. Please reset their gateways")
         __context__['retcode'] = 2
         return False
     if kwargs.get("debug", False):
         logger.info(validated)
 
+    # TODO: peers check
     return True
 
 
@@ -1769,3 +1834,32 @@
 
     return True
 
+
+def list_namespaces(raw_output=False):
+
+    ''' List of dicts ('id', 'uuid') parsed from `ip netns` output '''
+
+    proc = subprocess.Popen(['ip', 'netns'], stdout=subprocess.PIPE)
+    stdout, stderr =  proc.communicate()
+
+    namespaces = []
+
+    for line in stdout.split('\n'):
+        if len(line) > 0:
+            netns = {}
+            if raw_output:
+                netns['id'] = -2
+                netns['uuid'] = line
+            else:
+                line_splitted = line.split()
+                if len(line_splitted) > 1:
+                    ns_uuid = line_splitted[0]
+                    ns_id = int(line_splitted[2][:-1])
+                else:
+                    ns_uuid = line
+                    ns_id = -1
+                netns['id'] = ns_id
+                netns['uuid'] = ns_uuid
+            namespaces.append(netns)
+
+    return namespaces