Add alerts and recording rules for the Opencontrail ping check
Change-Id: I098be542d5600d4cc5e9000395c92249271a1a82
Related-PROD: PROD-28090
diff --git a/telegraf/files/script/check_opencontrail_ping.py b/telegraf/files/script/check_opencontrail_ping.py
index 1500dc2..4a32c22 100644
--- a/telegraf/files/script/check_opencontrail_ping.py
+++ b/telegraf/files/script/check_opencontrail_ping.py
@@ -23,8 +23,7 @@
def call_process(cmd):
p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output = p.communicate()
- exit_code = p.poll()
- return exit_code, output
+ return output
def instant_query(query):
@@ -90,14 +89,13 @@
thread_pool.close()
thread_pool.join()
for check in checks:
- exit_code, output = check["result"].get()
- print "instance_ping,ip_address=%(ip_address)s,mdata_ip=%(mdata_ip)s,id=%(id)s success=%(success)s,exit_code=%(exit_code)s" % \
+ output = check["result"].get()
+ print "instance_ping,ip_address=%(ip_address)s,mdata_ip=%(mdata_ip)s,id=%(id)s success=%(success)s" % \
{
'mdata_ip': check['mdata_ip'],
'ip_address': check['ip'],
'id': check['instance_uuid'],
'success': check_output(output),
- 'exit_code': exit_code
}
if __name__ == "__main__":
diff --git a/telegraf/meta/prometheus.yml b/telegraf/meta/prometheus.yml
index 87ba22b..e5f57a2 100644
--- a/telegraf/meta/prometheus.yml
+++ b/telegraf/meta/prometheus.yml
@@ -37,9 +37,28 @@
service: ovs
annotations:
summary: "The OVS instance arping check is down"
-{%- raw %}
+ {%- raw %}
description: "The OVS instance arping check on the {{ $labels.host }} node is down for 2 minutes."
-{%- endraw %}
+ {%- endraw %}
+ {%- endif %}
+{%- endif %}
+{%- if pillar.opencontrail is defined %}
+ {%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %}
+ OpencontrailInstancePingCheckDown:
+ if: instance_ping_check_up == 0
+ for: 2m
+ labels:
+ severity: major
+ service: contrail
+ annotations:
+ summary: "The Opencontrail instance ping check is down"
+ {%- raw %}
+ description: "The Opencontrail instance ping check on the {{ $labels.host }} node is down for 2 minutes."
+ {%- endraw %}
+ {%- endif %}
+{%- endif %}
+{%- if pillar.neutron is defined %}
+ {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or (pillar.neutron.get('compute',{}).get('enabled', False) == True and pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True) %}
recording:
instance_id:instance_arping_success:
query: >-
@@ -55,6 +74,23 @@
count(instance_id:instance_arping_success:avg10m:for10m == 0)
{%- endif %}
{%- endif %}
+{%- if pillar.opencontrail is defined %}
+ {%- if pillar.opencontrail.get('compute', {}).get('enabled', False) == True %}
+ recording:
+ instance_id:instance_ping_success:
+ query: >-
+ avg(instance_ping_success) by (id) * on(id) instance_ping_valid or on(id) instance_ping_valid
+ instance_id:instance_ping_success:avg10m:for10m:
+ query: >-
+ avg_over_time(instance_id:instance_ping_success[10m]) and instance_id:instance_ping_success and instance_id:instance_ping_success offset 10m
+ total:instance_id:instance_ping_success:avg10m:for10m:
+ query: >-
+ count(instance_id:instance_ping_success:avg10m:for10m)
+ total:instance_id:instance_ping_success:avg10m:for10m:eq0:
+ query: >-
+ count(instance_id:instance_ping_success:avg10m:for10m == 0)
+ {%- endif %}
+{%- endif %}
{%- if address is defined %}
target:
static: