Added new alert for Keepalived cluster outage
Change-Id: I60f76a916ba3bca4913b55635d4d8afbbf285837
Related-Bug: PROD-19537
diff --git a/keepalived/meta/prometheus.yml b/keepalived/meta/prometheus.yml
index a73f292..2c2741e 100644
--- a/keepalived/meta/prometheus.yml
+++ b/keepalived/meta/prometheus.yml
@@ -25,7 +25,8 @@
description: 'Keepalived is in the FAILED state on node {{ $labels.host }}'
{% endraw %}
KeepalivedMultipleIPAddr:
- if: 'count(ipcheck_assigned) by (ip) > 1'
+ if: >-
+ count(ipcheck_assigned) by (ip) > 1
{% raw %}
for: 2m
labels:
@@ -35,4 +36,27 @@
summary: 'Single IP is assigned more than once'
description: 'The IP: {{ $labels.ip }} is assigned more than once'
{% endraw %}
+ # Fires when every keepalived process in a cluster reports not running.
+ # The "cluster" label is derived from "host" by label_replace: the leading
+ # run of non-digit characters is captured (presumably ctl01 gives ctl; confirm
+ # against the deployment's host naming scheme).
+ # The alert holds when, per cluster, the number of keepalived series equals
+ # the number of those series whose value is 0.
+ KeepalivedServiceOutage:
+ if: >-
+ count(
+ label_replace(procstat_running{process_name="keepalived"}, "cluster", "$1", "host", "([^0-9]+).+")
+ ) by (cluster)
+ ==
+ count(
+ label_replace(procstat_running{process_name="keepalived"} == 0, "cluster", "$1", "host", "([^0-9]+).+")
+ ) by (cluster)
+ {% raw %}
+ labels:
+ severity: critical
+ service: keepalived
+ annotations:
+ summary: 'Keepalived cluster service outage'
+ description: 'All keepalived processes within cluster {{ $labels.cluster }} are down. Keepalived service is not available'
+ {% endraw %}
{%- endif %}