Improve opencontrail ping script

- add more granular exception handling in python script

Change-Id: I96d3648fc82c7650734ae490d9f0669f6b3b15bf
Related-bug: PROD-35586
diff --git a/telegraf/files/script/check_opencontrail_ping.py b/telegraf/files/script/check_opencontrail_ping.py
index 4560910..1f260dc 100644
--- a/telegraf/files/script/check_opencontrail_ping.py
+++ b/telegraf/files/script/check_opencontrail_ping.py
@@ -1,106 +1,146 @@
 #!/usr/bin/env python2
 
 import argparse
+from collections import defaultdict
 import multiprocessing
 from multiprocessing.pool import ThreadPool
-import requests
 import socket
+import simplejson
 import subprocess
+import sys
+
+import requests
 from xml.etree import ElementTree
 
 HOSTNAME = socket.gethostname()
-OPENCONTRAIL_URL = "http://localhost:8085/Snh_ItfReq"
+OPENCONTRAIL_URL = 'http://localhost:8085/Snh_ItfReq'
 
 parser = argparse.ArgumentParser()
-parser.add_argument("--host", default="mon")
-parser.add_argument("--port", default="15010")
-parser.add_argument("--processes", type=int, default=multiprocessing.cpu_count() * 2)
+parser.add_argument('--host', default='mon')
+parser.add_argument('--port', default='15010')
+parser.add_argument('--processes', type=int,
+                    default=multiprocessing.cpu_count() * 2)
+
 args = parser.parse_args()
 
-PROMETHEUS_QUERY_API = "http://{}:{}/api/v1/query".format(args.host, args.port)
+PROMETHEUS_QUERY_API = 'http://{}:{}/api/v1/query'.format(args.host, args.port)
 
 
 def call_process(cmd):
-    p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    output = p.communicate()
-    return output
+    p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
+    return p.communicate()
 
 
 def instant_query(query):
-    params = {"query": query}
-    result = requests.get(PROMETHEUS_QUERY_API, params=params).json()["data"]["result"]
+    params = {'query': query}
+    response = requests.get(PROMETHEUS_QUERY_API, params=params)
+    try:
+        result = response.json()['data']['result']
+    except (ValueError, simplejson.scanner.JSONDecodeError, KeyError) as e:
+        msg = '# Query error {}: {}\n'.format(e.__class__.__name__, e.message)
+        sys.stderr.write(msg)
+        return
     return result
 
 
 def get_opencontrail_vms():
     response = requests.get(OPENCONTRAIL_URL)
     tree = ElementTree.fromstring(response.content)
-    vms = {}
-    for data in tree.iter("ItfSandeshData"):
-        vm_uuid = data.find("vm_uuid").text
+    vms = defaultdict(list)
+
+    for data in tree.iter('ItfSandeshData'):
+        vm_uuid = data.find('vm_uuid').text
         if not vm_uuid:
             continue
-        ip_addr = data.find("ip_addr").text
-        mdata_ip_addr = data.find("mdata_ip_addr").text
+        ip_addr = data.find('ip_addr').text
+        mdata_ip_addr = data.find('mdata_ip_addr').text
         addr = (ip_addr, mdata_ip_addr)
-        if vm_uuid in vms:
-            vms[vm_uuid].append(addr)
-        else:
-            vms[vm_uuid] = [addr]
+        vms[vm_uuid].append(addr)
+
     return vms
 
 
-def check_output(data):
-    stdout = data[0]
+def all_packets_received(data):
     transmitted = 0
     received = -1
-    for line in stdout.split("\n"):
+    for line in data[0].split('\n'):
         if 'transmitted' in line:
             transmitted = int(line.split()[0])
             received = int(line.split()[3])
-    return 1 if received == transmitted else 0
+    return int(received == transmitted)
 
 
-def gather():
+def gather(metrics):
     hosted_active_vms = set()
-    query = 'libvirt_domain_info_state{host="%s"}' % HOSTNAME
-    metrics = instant_query(query)
     for metric in metrics:
-        instance_uuid = metric["metric"].get("instance_uuid", "")
-        instance_state = metric["value"][1]
-        if instance_uuid and instance_state == "1":
+        instance_uuid = metric['metric'].get('instance_uuid', '')
+        instance_state = metric['value'][1]
+        if instance_uuid and instance_state == '1':
             hosted_active_vms.add(instance_uuid)
+
     checks = []
     thread_pool = ThreadPool(args.processes)
     vms = get_opencontrail_vms()
+
+    ping_valid_tmp = 'instance_ping,id={instance} valid={valid}\n'
+
     for instance in hosted_active_vms:
         if instance not in vms:
-            print "instance_ping,id=%s valid=0" % instance
+            sys.stdout.write(ping_valid_tmp.format(instance=instance,
+                                                   valid=0))
             continue
         addresses = vms[instance]
         for ip, mdata_ip in addresses:
             if not mdata_ip.startswith('169.254'):
-                print "instance_ping,id=%s valid=0" % instance
+                sys.stdout.write(ping_valid_tmp.format(instance=instance,
+                                                       valid=0))
                 continue
-            print "instance_ping,id=%s valid=1" % instance
-            cmd = "ping -c3 -i0.2 -W1 %s" % mdata_ip
+            sys.stdout.write(ping_valid_tmp.format(instance=instance,
+                                                   valid=1))
+            cmd = 'ping -c3 -i0.2 -W1 {}'.format(mdata_ip)
             result = thread_pool.apply_async(call_process, (cmd,))
-            checks.append({"instance_uuid": instance, "result": result, "ip": ip, "mdata_ip": mdata_ip})
+            checks.append({
+                'instance_uuid': instance,
+                'result': result,
+                'ip': ip,
+                'mdata_ip': mdata_ip,
+            })
+
     thread_pool.close()
     thread_pool.join()
-    for check in checks:
-        output = check["result"].get()
-        print "instance_ping,ip_address=%(ip_address)s,mdata_ip=%(mdata_ip)s,id=%(id)s success=%(success)s" % \
-              {
-                  'mdata_ip': check['mdata_ip'],
-                  'ip_address': check['ip'],
-                  'id': check['instance_uuid'],
-                  'success': check_output(output),
-              }
 
-if __name__ == "__main__":
+    ping_result_tmp = ('instance_ping,ip_address={ip_address},'
+                       'mdata_ip={mdata_ip},id={id} success={success}\n')
+
+    for check in checks:
+        output = check['result'].get()
+        success = all_packets_received(output)
+        fmt_input = {
+            'mdata_ip': check['mdata_ip'],
+            'ip_address': check['ip'],
+            'id': check['instance_uuid'],
+            'success': success,
+        }
+        sys.stdout.write(ping_result_tmp.format(**fmt_input))
+
+
+def main():
     try:
-        gather()
-        print "instance_ping check_up=1"
-    except Exception:
-        print "instance_ping check_up=0"
+        query = 'libvirt_domain_info_state{host="%s"}' % HOSTNAME
+        metrics = instant_query(query)
+
+        if metrics is None:
+            sys.stdout.write('instance_ping check_up=0\n')
+            return
+
+        gather(metrics)
+        sys.stdout.write('instance_ping check_up=1\n')
+    except Exception as e:
+        sys.stdout.write('instance_ping check_up=0\n')
+        msg = '# Check error {}: {}\n'.format(e.__class__.__name__, e.message)
+        sys.stderr.write(msg)
+
+
+if __name__ == '__main__':
+    main()