Split the Telegraf gathering-error alert so that remote Telegraf instances get a separate alert.
Fixes: PROD-35022
Change-Id: I5ac8d15d392da5882124f38e2f995a492df00afc
diff --git a/telegraf/meta/prometheus.yml b/telegraf/meta/prometheus.yml
index 08c139d..eef9957 100644
--- a/telegraf/meta/prometheus.yml
+++ b/telegraf/meta/prometheus.yml
@@ -28,13 +28,22 @@
description: "The {{ $labels.host }} node is unreachable at {{ $labels.url }}, the Telegraf and Fluentd targets on the {{ $labels.host }} node are down."
TelegrafGatherErrors:
if: >-
- rate(internal_agent_gather_errors[10m]) > 0
+ rate(internal_agent_gather_errors{job!="remote_agent"}[10m]) > 0
labels:
severity: major
service: telegraf
annotations:
summary: "Telegraf failed to gather metrics"
description: "Telegraf has gathering errors on the {{ $labels.host }} node for the last 10 minutes."
+ TelegrafRemoteGatherErrors:
+ if: >-
+ rate(internal_agent_gather_errors{job="remote_agent"}[10m]) > 0
+ labels:
+ severity: major
+ service: telegraf
+ annotations:
+ summary: "Remote Telegraf failed to gather metrics"
+ description: "Remote Telegraf has gathering errors for the last 10 minutes. Check 'monitoring_remote_agent' container's logs for details."
{%- endraw %}
{%- if pillar.reclass is defined %}
{%- if pillar.reclass.get('storage', {}).get('enabled', False) and pillar.reclass.get('storage', {}).get('data_source',{}).get('engine',"") == "local" %}