Merge "Add alert and recording rules for OVS arping checker"
diff --git a/telegraf/files/script/check_opencontrail_ping.py b/telegraf/files/script/check_opencontrail_ping.py
new file mode 100644
index 0000000..1500dc2
--- /dev/null
+++ b/telegraf/files/script/check_opencontrail_ping.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python2
+
+import argparse
+import multiprocessing
+from multiprocessing.pool import ThreadPool
+import requests
+import socket
+import subprocess
+from xml.etree import ElementTree
+
+HOSTNAME = socket.gethostname()  # compared with the host label of the libvirt metrics
+OPENCONTRAIL_URL = "http://localhost:8085/Snh_ItfReq"  # fetched by get_opencontrail_vms()
+# Command-line options; parsed at import time because gather() reads `args`.
+parser = argparse.ArgumentParser(description="Ping the metadata IP of each active VM on this host.")
+parser.add_argument("--host", default="mon", help="Prometheus host to query (default: mon)")
+parser.add_argument("--port", default="15016", help="Prometheus port to query (default: 15016)")
+parser.add_argument("--processes", type=int, default=multiprocessing.cpu_count() * 2, help="size of the ping thread pool (default: twice the CPU count)")
+args = parser.parse_args()
+# Query endpoint of the Prometheus HTTP API, built from --host/--port.
+PROMETHEUS_QUERY_API = "http://{}:{}/api/v1/query".format(args.host, args.port)
+
+
+def call_process(cmd):
+    """Run `cmd` (split on whitespace, no shell); return (exit_code, (stdout, stderr))."""
+    child = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    streams = child.communicate()
+    return child.returncode, streams
+
+
+def instant_query(query):
+    """Send `query` to PROMETHEUS_QUERY_API and return the reply's data.result entry."""
+    reply = requests.get(PROMETHEUS_QUERY_API, params={"query": query})
+    return reply.json()["data"]["result"]
+
+
+def get_opencontrail_vms():
+    """Map each VM UUID to the addresses of its interfaces.
+
+    Fetches OPENCONTRAIL_URL, parses the XML reply and walks every
+    ItfSandeshData element; elements whose vm_uuid text is empty are skipped.
+
+    Returns a dict {vm_uuid: [(ip_addr, mdata_ip_addr), ...]}.
+    """
+    reply = requests.get(OPENCONTRAIL_URL)
+    addresses_by_vm = {}
+    for itf in ElementTree.fromstring(reply.content).iter("ItfSandeshData"):
+        vm_uuid = itf.find("vm_uuid").text
+        if vm_uuid:
+            pair = (itf.find("ip_addr").text, itf.find("mdata_ip_addr").text)
+            addresses_by_vm.setdefault(vm_uuid, []).append(pair)
+    return addresses_by_vm
+
+
+def check_output(data):
+    """Return 1 when ping's summary line in data[0] reports no lost packets, else 0.
+
+    The last line containing 'transmitted' wins; without one the result is 0.
+    """
+    sent, got = 0, -1
+    for fields in (row.split() for row in data[0].split("\n") if 'transmitted' in row):
+        sent, got = int(fields[0]), int(fields[3])
+    return int(got == sent)
+
+
+def gather():
+    """Ping the metadata IP of every active VM on this host and print the results.
+
+    Active VMs are the instance_uuid labels of the libvirt_domain_info_state series
+    for HOSTNAME whose value is "1". A VM absent from get_opencontrail_vms(), or an
+    interface whose metadata IP is unset or does not start with 169.254, is printed
+    with valid=0; every other interface is pinged from a pool of args.processes threads.
+    """
+    query = 'libvirt_domain_info_state{host="%s"}' % HOSTNAME
+    hosted_active_vms = set()
+    for metric in instant_query(query):
+        instance_uuid = metric["metric"].get("instance_uuid", "")
+        if instance_uuid and metric["value"][1] == "1":
+            hosted_active_vms.add(instance_uuid)
+    vms = get_opencontrail_vms()  # fetched before the pool so a failure starts no threads
+    checks = []
+    thread_pool = ThreadPool(args.processes)
+    for instance in hosted_active_vms:
+        if instance not in vms:
+            print "instance_ping,id=%s valid=0" % instance
+            continue
+        for ip, mdata_ip in vms[instance]:
+            # An empty XML element yields None; report it instead of crashing on startswith().
+            if not mdata_ip or not mdata_ip.startswith('169.254'):
+                print "instance_ping,id=%s valid=0" % instance
+                continue
+            print "instance_ping,id=%s valid=1" % instance
+            cmd = "ping -c3 -i0.2 -W1 %s" % mdata_ip
+            result = thread_pool.apply_async(call_process, (cmd,))
+            checks.append({"instance_uuid": instance, "result": result, "ip": ip, "mdata_ip": mdata_ip})
+    thread_pool.close()
+    thread_pool.join()
+    for check in checks:
+        exit_code, output = check["result"].get()
+        fields = {'mdata_ip': check['mdata_ip'], 'ip_address': check['ip'], 'id': check['instance_uuid'],
+                  'success': check_output(output), 'exit_code': exit_code}
+        print "instance_ping,ip_address=%(ip_address)s,mdata_ip=%(mdata_ip)s,id=%(id)s success=%(success)s,exit_code=%(exit_code)s" % fields
+
+if __name__ == "__main__":
+    try:
+        gather()
+        print "instance_ping check_up=1"  # reached only when gather() did not raise
+    except Exception:  # any failure is reported as a metric instead of a traceback
+        print "instance_ping check_up=0"
diff --git a/telegraf/meta/telegraf.yml b/telegraf/meta/telegraf.yml
index d0801fb..d05b4fd 100644
--- a/telegraf/meta/telegraf.yml
+++ b/telegraf/meta/telegraf.yml
@@ -10,6 +10,18 @@
   {%- endif %}
 {%- endif %}
 
+{%- if pillar.opencontrail is defined %}
+  {%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %}
+    {%- set prometheus_address = pillar._param.stacklight_monitor_address %}
+agent:
+  input:
+    opencontrail_ping_check:
+      template: telegraf/files/input/exec.conf
+      commands: "/usr/local/bin/check_opencontrail_ping.py --host {{ prometheus_address }} --port 15016"
+      interval: 45s
+  {%- endif %}
+{%- endif %}
+
 {%- if pillar.telegraf.remote_agent is defined %}
   {%- set addresses = [] %}
   {%- for node_name, node_grains in salt['mine.get']('*', 'grains.items').items() %}
diff --git a/telegraf/script.sls b/telegraf/script.sls
index 6e058cf..f809924 100644
--- a/telegraf/script.sls
+++ b/telegraf/script.sls
@@ -10,3 +10,17 @@
 
   {%- endif %}
 {%- endif %}
+
+{%- if pillar.opencontrail is defined %}
+  {%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %}
+    {%- set prometheus_address = pillar._param.stacklight_monitor_address %}
+
+opencontrail_ping_check_telegraf_script:
+  file.managed:
+  - name: /usr/local/bin/check_opencontrail_ping.py
+  - source: salt://telegraf/files/script/check_opencontrail_ping.py
+  - template: jinja
+  - mode: 755
+
+  {%- endif %}
+{%- endif %}