Merge "Add alerts and targets for prometheus LTS and relay"
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index df2b7c3..e4d3cb4 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -5,3 +5,5 @@
enabled: true
grafana:
enabled: true
+ telegraf:  # enables the telegraf support entry next to grafana; presumably consumed via prometheus/meta/telegraf.yml - confirm
+ enabled: true
diff --git a/prometheus/meta/prometheus.yml b/prometheus/meta/prometheus.yml
index 80768e2..4451e8f 100644
--- a/prometheus/meta/prometheus.yml
+++ b/prometheus/meta/prometheus.yml
@@ -1,5 +1,5 @@
{%- if pillar.prometheus is defined %}
-{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring with context %}
+{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}
server:
alert:
{%- if server.get('enabled', False) %}
@@ -117,4 +117,105 @@
description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid for 2 minutes."
{%- endraw %}
{%- endif %}
+{%- if relay.get('enabled', False) %}
+{%- raw %}
+ PrometheusRelayServiceDown:  # per-series: procstat reports the prometheus-relay process as not running for 2m
+ if: >-
+ procstat_running{process_name="prometheus-relay"} == 0
+ for: 2m
+ labels:
+ severity: minor
+ service: prometheus
+ annotations:
+ summary: "Prometheus relay service is down"
+ description: "The Prometheus relay service on the {{$labels.host}} node is down for 2 minutes."
+ PrometheusRelayServiceDownMajor:  # aggregate: number of down relay processes is at least half of all reporting ones
+ if: >-
+ count(procstat_running{process_name="prometheus-relay"} == 0) >= count(procstat_running{process_name="prometheus-relay"}) * 0.5
+ for: 2m
+ labels:
+ severity: major
+ service: prometheus
+ annotations:
+ summary: "50% of Prometheus relay services are down"
+ description: "{{ $value }} of Prometheus relay services (>= 50%) are down for 2 minutes."
+ PrometheusRelayServiceOutage:  # aggregate: every reporting relay process is down
+ if: >-
+ count(procstat_running{process_name="prometheus-relay"} == 0) == count(procstat_running{process_name="prometheus-relay"})
+ for: 2m
+ labels:
+ severity: critical
+ service: prometheus
+ annotations:
+ summary: "Prometheus relay service outage"
+ description: "All Prometheus relay services are down for 2 minutes."
+{%- endraw %}
+{%- endif %}
+{%- if server.get("enabled", False) and not server.get("is_container", True) %}
+{%- raw %}
+ PrometheusLTSServiceDown:  # per-series: procstat reports the non-containerized prometheus process as not running for 2m
+ if: >-
+ procstat_running{process_name="prometheus"} == 0
+ for: 2m
+ labels:
+ severity: minor
+ service: prometheus
+ annotations:
+ summary: "Prometheus Long Term Storage service is down"
+ description: "The Prometheus Long Term Storage service on the {{$labels.host}} node is down for 2 minutes."
+ PrometheusLTSServiceDownMajor:  # was PrometheusRelayServiceDownMajor: that duplicate key overrode the relay rule of the same name
+ if: >-
+ count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5
+ for: 2m
+ labels:
+ severity: major
+ service: prometheus
+ annotations:
+ summary: "50% of Prometheus Long Term Storage services are down"
+ description: "{{ $value }} of Prometheus Long Term Storage services (>= 50%) are down for 2 minutes."
+ PrometheusLTSServiceOutage:  # was PrometheusRelayServiceOutage: that duplicate key overrode the relay rule of the same name
+ if: >-
+ count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})
+ for: 2m
+ labels:
+ severity: critical
+ service: prometheus
+ annotations:
+ summary: "Prometheus Long Term Storage service outage"
+ description: "All Prometheus Long Term Storage services are down for 2 minutes."
+{%- endraw %}
+{%- endif %}
+{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
+ {%- set addresses = [] %}
+ {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
+ {%- do addresses.append(server.bind.address) %}
+ {%- endif %}
+ {%- for address in grains['fqdn_ip4'] %}
+ {%- if not address.startswith('127') %}
+ {%- do addresses.append(address) %}
+ {%- endif %}
+ {%- endfor %}
+ target:  # NOTE(review): addresses[0] below assumes at least one non-loopback address was collected - confirm
+ static:
+ prometheus_lts:
+ enabled: true
+ endpoint:
+ - address: "{{ addresses[0] }}"
+ port: {{ server.bind.port }}
+ relabel_configs:
+ - regex: "{{ addresses[0] }}:{{ server.bind.port }}"
+ replacement: "{{ grains['host'] }}"  # quoted so a numeric or boolean-looking hostname stays a string
+ source_labels: "__address__"
+ target_label: "host"
+ prometheus_relay:
+ enabled: true
+ endpoint:
+ - address: "{{ addresses[0] }}"
+ port: {{ relay.bind.port }}
+ relabel_configs:
+ - regex: "{{ addresses[0] }}:{{ relay.bind.port }}"
+ replacement: "{{ grains['host'] }}"  # quoted so a numeric or boolean-looking hostname stays a string
+ source_labels: "__address__"
+ target_label: "host"
+{%- endif %}
{%- endif %}
diff --git a/prometheus/meta/telegraf.yml b/prometheus/meta/telegraf.yml
new file mode 100644
index 0000000..3cce8a9
--- /dev/null
+++ b/prometheus/meta/telegraf.yml
@@ -0,0 +1,17 @@
+{%- if pillar.prometheus is defined %}
+{%- from "prometheus/map.jinja" import server, relay with context %}
+agent:
+ input:
+ {%- if relay.get('enabled', False) or (server.get("enabled", False) and not server.get("is_container", True)) %}
+ procstat:  # emitted when either process below needs watching; each entry keeps its own guard
+ process:
+ {%- if relay.get('enabled', False) %}
+ prometheus-relay:
+ pattern: '/usr/bin/prometheus-relay'
+ {%- endif %}
+ {%- if server.get("enabled", False) and not server.get("is_container", True) %}
+ prometheus:
+ pattern: '/usr/bin/prometheus[^-]'  # [^-] keeps this from also matching /usr/bin/prometheus-relay
+ {%- endif %}
+ {%- endif %}
+{%- endif %}