Merge "Add alert and recording rules for OVS arping checker"
diff --git a/telegraf/files/script/check_opencontrail_ping.py b/telegraf/files/script/check_opencontrail_ping.py
new file mode 100644
index 0000000..1500dc2
--- /dev/null
+++ b/telegraf/files/script/check_opencontrail_ping.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python2
+
+import argparse
+import multiprocessing
+from multiprocessing.pool import ThreadPool
+import requests
+import socket
+import subprocess
+from xml.etree import ElementTree
+
+# Name of the local host; used as the ``host`` label value in the Prometheus
+# query built by gather().
+HOSTNAME = socket.gethostname()
+# Local HTTP endpoint that is queried for the OpenContrail interface list
+# (the response is parsed as XML).
+OPENCONTRAIL_URL = "http://localhost:8085/Snh_ItfReq"
+
+# NOTE: command-line options are parsed at import time; the resulting
+# ``args`` object is a module-level global read by gather().
+parser = argparse.ArgumentParser()
+# Host and port of the Prometheus server whose query API is called.
+parser.add_argument("--host", default="mon")
+parser.add_argument("--port", default="15016")
+# Size of the thread pool that runs the pings; defaults to twice the CPU count.
+parser.add_argument("--processes", type=int, default=multiprocessing.cpu_count() * 2)
+args = parser.parse_args()
+
+# Full URL of the Prometheus instant-query endpoint.
+PROMETHEUS_QUERY_API = "http://{}:{}/api/v1/query".format(args.host, args.port)
+
+
+def call_process(cmd):
+ p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ output = p.communicate()
+ exit_code = p.poll()
+ return exit_code, output
+
+
+def instant_query(query):
+ params = {"query": query}
+ result = requests.get(PROMETHEUS_QUERY_API, params=params).json()["data"]["result"]
+ return result
+
+
+def get_opencontrail_vms():
+ response = requests.get(OPENCONTRAIL_URL)
+ tree = ElementTree.fromstring(response.content)
+ vms = {}
+ for data in tree.iter("ItfSandeshData"):
+ vm_uuid = data.find("vm_uuid").text
+ if not vm_uuid:
+ continue
+ ip_addr = data.find("ip_addr").text
+ mdata_ip_addr = data.find("mdata_ip_addr").text
+ addr = (ip_addr, mdata_ip_addr)
+ if vm_uuid in vms:
+ vms[vm_uuid].append(addr)
+ else:
+ vms[vm_uuid] = [addr]
+ return vms
+
+
+def check_output(data):
+ stdout = data[0]
+ transmitted = 0
+ received = -1
+ for line in stdout.split("\n"):
+ if 'transmitted' in line:
+ transmitted = int(line.split()[0])
+ received = int(line.split()[3])
+ return 1 if received == transmitted else 0
+
+
+def gather():
+ hosted_active_vms = set()
+ query = 'libvirt_domain_info_state{host="%s"}' % HOSTNAME
+ metrics = instant_query(query)
+ for metric in metrics:
+ instance_uuid = metric["metric"].get("instance_uuid", "")
+ instance_state = metric["value"][1]
+ if instance_uuid and instance_state == "1":
+ hosted_active_vms.add(instance_uuid)
+ checks = []
+ thread_pool = ThreadPool(args.processes)
+ vms = get_opencontrail_vms()
+ for instance in hosted_active_vms:
+ if instance not in vms:
+ print "instance_ping,id=%s valid=0" % instance
+ continue
+ addresses = vms[instance]
+ for ip, mdata_ip in addresses:
+ if not mdata_ip.startswith('169.254'):
+ print "instance_ping,id=%s valid=0" % instance
+ continue
+ print "instance_ping,id=%s valid=1" % instance
+ cmd = "ping -c3 -i0.2 -W1 %s" % mdata_ip
+ result = thread_pool.apply_async(call_process, (cmd,))
+ checks.append({"instance_uuid": instance, "result": result, "ip": ip, "mdata_ip": mdata_ip})
+ thread_pool.close()
+ thread_pool.join()
+ for check in checks:
+ exit_code, output = check["result"].get()
+ print "instance_ping,ip_address=%(ip_address)s,mdata_ip=%(mdata_ip)s,id=%(id)s success=%(success)s,exit_code=%(exit_code)s" % \
+ {
+ 'mdata_ip': check['mdata_ip'],
+ 'ip_address': check['ip'],
+ 'id': check['instance_uuid'],
+ 'success': check_output(output),
+ 'exit_code': exit_code
+ }
+
+if __name__ == "__main__":
+ try:
+ gather()
+ print "instance_ping check_up=1"
+ except Exception:
+ print "instance_ping check_up=0"
diff --git a/telegraf/meta/telegraf.yml b/telegraf/meta/telegraf.yml
index d0801fb..d05b4fd 100644
--- a/telegraf/meta/telegraf.yml
+++ b/telegraf/meta/telegraf.yml
@@ -10,6 +10,18 @@
{%- endif %}
{%- endif %}
+{#- When the opencontrail compute role is enabled in pillar, register
+    check_opencontrail_ping.py as a telegraf exec input. The script is
+    pointed at the address in pillar._param.stacklight_monitor_address
+    (port 15016 is hard-coded) and runs every 45s. #}
+{%- if pillar.opencontrail is defined %}
+ {%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %}
+ {%- set prometheus_address = pillar._param.stacklight_monitor_address %}
+agent:
+ input:
+ opencontrail_ping_check:
+ template: telegraf/files/input/exec.conf
+ commands: "/usr/local/bin/check_opencontrail_ping.py --host {{ prometheus_address }} --port 15016"
+ interval: 45s
+ {%- endif %}
+{%- endif %}
+
{%- if pillar.telegraf.remote_agent is defined %}
{%- set addresses = [] %}
{%- for node_name, node_grains in salt['mine.get']('*', 'grains.items').items() %}
diff --git a/telegraf/script.sls b/telegraf/script.sls
index 6e058cf..f809924 100644
--- a/telegraf/script.sls
+++ b/telegraf/script.sls
@@ -10,3 +10,17 @@
{%- endif %}
{%- endif %}
+
+{#- When the opencontrail compute role is enabled in pillar, install the
+    ping check script to /usr/local/bin (mode 755). The source file is
+    rendered through Jinja before being written.
+    NOTE(review): prometheus_address is set below but is not referenced
+    anywhere in this state -- confirm whether it is still needed. #}
+{%- if pillar.opencontrail is defined %}
+ {%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %}
+ {%- set prometheus_address = pillar._param.stacklight_monitor_address %}
+
+opencontrail_ping_check_telegraf_script:
+ file.managed:
+ - name: /usr/local/bin/check_opencontrail_ping.py
+ - source: salt://telegraf/files/script/check_opencontrail_ping.py
+ - template: jinja
+ - mode: 755
+
+ {%- endif %}
+{%- endif %}