Improve opencontrail ping script
- add more granular exception handling in python script
Change-Id: I96d3648fc82c7650734ae490d9f0669f6b3b15bf
Related-bug: PROD-35586
diff --git a/telegraf/files/script/check_opencontrail_ping.py b/telegraf/files/script/check_opencontrail_ping.py
index 4560910..1f260dc 100644
--- a/telegraf/files/script/check_opencontrail_ping.py
+++ b/telegraf/files/script/check_opencontrail_ping.py
@@ -1,106 +1,146 @@
#!/usr/bin/env python2
import argparse
+from collections import defaultdict
import multiprocessing
from multiprocessing.pool import ThreadPool
-import requests
import socket
+import simplejson
import subprocess
+import sys
+
+import requests
from xml.etree import ElementTree
HOSTNAME = socket.gethostname()
-OPENCONTRAIL_URL = "http://localhost:8085/Snh_ItfReq"
+OPENCONTRAIL_URL = 'http://localhost:8085/Snh_ItfReq'
parser = argparse.ArgumentParser()
-parser.add_argument("--host", default="mon")
-parser.add_argument("--port", default="15010")
-parser.add_argument("--processes", type=int, default=multiprocessing.cpu_count() * 2)
+parser.add_argument('--host', default='mon')
+parser.add_argument('--port', default='15010')
+parser.add_argument('--processes', type=int,
+ default=multiprocessing.cpu_count() * 2)
+
args = parser.parse_args()
-PROMETHEUS_QUERY_API = "http://{}:{}/api/v1/query".format(args.host, args.port)
+PROMETHEUS_QUERY_API = 'http://{}:{}/api/v1/query'.format(args.host, args.port)
def call_process(cmd):
- p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- output = p.communicate()
- return output
+ p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ return p.communicate()
def instant_query(query):
- params = {"query": query}
- result = requests.get(PROMETHEUS_QUERY_API, params=params).json()["data"]["result"]
+ params = {'query': query}
+ response = requests.get(PROMETHEUS_QUERY_API, params=params)
+ try:
+ result = response.json()['data']['result']
+ except (ValueError, simplejson.scanner.JSONDecodeError, KeyError) as e:
+ msg = '# Query error {}: {}\n'.format(e.__class__.__name__, e.message)
+ sys.stderr.write(msg)
+ return
return result
def get_opencontrail_vms():
response = requests.get(OPENCONTRAIL_URL)
tree = ElementTree.fromstring(response.content)
- vms = {}
- for data in tree.iter("ItfSandeshData"):
- vm_uuid = data.find("vm_uuid").text
+ vms = defaultdict(list)
+
+ for data in tree.iter('ItfSandeshData'):
+ vm_uuid = data.find('vm_uuid').text
if not vm_uuid:
continue
- ip_addr = data.find("ip_addr").text
- mdata_ip_addr = data.find("mdata_ip_addr").text
+ ip_addr = data.find('ip_addr').text
+ mdata_ip_addr = data.find('mdata_ip_addr').text
addr = (ip_addr, mdata_ip_addr)
- if vm_uuid in vms:
- vms[vm_uuid].append(addr)
- else:
- vms[vm_uuid] = [addr]
+ vms[vm_uuid].append(addr)
+
return vms
-def check_output(data):
- stdout = data[0]
+def all_packets_received(data):
transmitted = 0
received = -1
- for line in stdout.split("\n"):
+ for line in data[0].split('\n'):
if 'transmitted' in line:
transmitted = int(line.split()[0])
received = int(line.split()[3])
- return 1 if received == transmitted else 0
+ return int(received == transmitted)
-def gather():
+def gather(metrics):
hosted_active_vms = set()
- query = 'libvirt_domain_info_state{host="%s"}' % HOSTNAME
- metrics = instant_query(query)
for metric in metrics:
- instance_uuid = metric["metric"].get("instance_uuid", "")
- instance_state = metric["value"][1]
- if instance_uuid and instance_state == "1":
+ instance_uuid = metric['metric'].get('instance_uuid', '')
+ instance_state = metric['value'][1]
+ if instance_uuid and instance_state == '1':
hosted_active_vms.add(instance_uuid)
+
checks = []
thread_pool = ThreadPool(args.processes)
vms = get_opencontrail_vms()
+
+ ping_valid_tmp = 'instance_ping,id={instance} valid={valid}\n'
+
for instance in hosted_active_vms:
if instance not in vms:
- print "instance_ping,id=%s valid=0" % instance
+ sys.stdout.write(ping_valid_tmp.format(instance=instance,
+ valid=0))
continue
addresses = vms[instance]
for ip, mdata_ip in addresses:
if not mdata_ip.startswith('169.254'):
- print "instance_ping,id=%s valid=0" % instance
+ sys.stdout.write(ping_valid_tmp.format(instance=instance,
+ valid=0))
continue
- print "instance_ping,id=%s valid=1" % instance
- cmd = "ping -c3 -i0.2 -W1 %s" % mdata_ip
+ sys.stdout.write(ping_valid_tmp.format(instance=instance,
+ valid=1))
+ cmd = 'ping -c3 -i0.2 -W1 {}'.format(mdata_ip)
result = thread_pool.apply_async(call_process, (cmd,))
- checks.append({"instance_uuid": instance, "result": result, "ip": ip, "mdata_ip": mdata_ip})
+ checks.append({
+ 'instance_uuid': instance,
+ 'result': result,
+ 'ip': ip,
+ 'mdata_ip': mdata_ip,
+ })
+
thread_pool.close()
thread_pool.join()
- for check in checks:
- output = check["result"].get()
- print "instance_ping,ip_address=%(ip_address)s,mdata_ip=%(mdata_ip)s,id=%(id)s success=%(success)s" % \
- {
- 'mdata_ip': check['mdata_ip'],
- 'ip_address': check['ip'],
- 'id': check['instance_uuid'],
- 'success': check_output(output),
- }
-if __name__ == "__main__":
+ ping_result_tmp = ('instance_ping,ip_address={ip_address},'
+ 'mdata_ip={mdata_ip},id={id} success={success}\n')
+
+ for check in checks:
+ output = check['result'].get()
+ success = all_packets_received(output)
+ fmt_input = {
+ 'mdata_ip': check['mdata_ip'],
+ 'ip_address': check['ip'],
+ 'id': check['instance_uuid'],
+ 'success': success,
+ }
+ sys.stdout.write(ping_result_tmp.format(**fmt_input))
+
+
+def main():
try:
- gather()
- print "instance_ping check_up=1"
- except Exception:
- print "instance_ping check_up=0"
+ query = 'libvirt_domain_info_state{host="%s"}' % HOSTNAME
+ metrics = instant_query(query)
+
+ if metrics is None:
+ sys.stdout.write('instance_ping check_up=0\n')
+ return
+
+ gather(metrics)
+ sys.stdout.write('instance_ping check_up=1\n')
+ except Exception as e:
+ sys.stdout.write('instance_ping check_up=0\n')
+ msg = '# Check error {}: {}\n'.format(e.__class__.__name__, e.message)
+ sys.stderr.write(msg)
+
+
+if __name__ == '__main__':
+ main()