Add Telegraf gather errors alert
Change-Id: I6b3b7c86a21a83f1cc44991a235f063e27ae747c
Related-PROD: PROD-30675
diff --git a/telegraf/meta/prometheus.yml b/telegraf/meta/prometheus.yml
index 46d7fe3..258cd81 100644
--- a/telegraf/meta/prometheus.yml
+++ b/telegraf/meta/prometheus.yml
@@ -27,6 +27,17 @@
summary: "The {{ $labels.host }} node is down"
description: "The {{ $labels.host }} node is unreachable at {{ $labels.url }}, the Telegraf and Fluentd targets on the {{ $labels.host }} node are down."
{%- endraw %}
+ TelegrafGatherErrors:  # Fires when the rate of internal_agent_gather_errors over a 10m window is above zero.
+ if: >-
+ rate(internal_agent_gather_errors[10m]) > 0
+ labels:  # NOTE(review): presumably consumed for alert routing/grouping; confirm against the Alertmanager config.
+ severity: major
+ service: telegraf
+ annotations:  # The description below sits in a Jinja raw block so its label template is emitted verbatim.
+ summary: "Telegraf failed to gather metrics"
+{%- raw %}
+ description: "Telegraf has gathering errors on the {{ $labels.host }} node for the last 10 minutes."
+{%- endraw %}
{%- if pillar.neutron is defined %}
{%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or (pillar.neutron.get('compute',{}).get('enabled', False) == True and pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True) %}
OVSInstanceArpingCheckDown: