# blob: 505b3f3a5876f412db7f17b737e948a539acd819 [file] [log] [blame]
{%- from "keepalived/map.jinja" import cluster with context %}
{%- if cluster.get('enabled', False) %}
# Keepalived alert rules, rendered only when the "cluster" settings imported
# from keepalived/map.jinja have "enabled" set to a true value.
#
# In every rule the labels/annotations sit inside a Jinja raw/endraw pair so
# that the double-curly "$labels" placeholders in the descriptions are passed
# through verbatim instead of being evaluated by Jinja (presumably they are
# expanded later by the alerting engine that consumes this file - confirm).
# NOTE: do not write Jinja delimiters in comments outside those raw sections;
# Jinja would try to evaluate them.
server:
  alert:
    # Matches every procstat_running series of the keepalived process whose
    # value is 0, i.e. no running keepalived process was reported.
    KeepalivedProcessDown:
      if: >-
        procstat_running{process_name="keepalived"} == 0
    {% raw %}
      labels:
        severity: major
        service: keepalived
      annotations:
        summary: "Keepalived process is down"
        description: "The Keepalived process on the {{ $labels.host }} node is down."
    {% endraw %}
    # Matches keepalived_state series equal to 0; the annotations below
    # describe that value as the FAILED state.
    KeepalivedFailedState:
      if: >-
        keepalived_state == 0
    {% raw %}
      labels:
        severity: minor
        service: keepalived
      annotations:
        summary: "Keepalived state is FAILED"
        description: "Keepalived is in the FAILED state on the {{ $labels.host }} node."
    {% endraw %}
    # Matches when more than one ipcheck_assigned series shares the same "ip"
    # label value. This is the only rule here that sets a "for" duration (2m).
    KeepalivedMultipleIPAddr:
      if: >-
        count(ipcheck_assigned) by (ip) > 1
    {% raw %}
      for: 2m
      labels:
        severity: major
        service: keepalived
      annotations:
        summary: "Keepalived VIP is assigned more than once"
        description: "The Keepalived {{ $labels.ip }} virtual IP is assigned more than once."
    {% endraw %}
    # Derives a "cluster" label from the first regex capture group applied to
    # the "host" label (the leading run of non-digit characters, e.g. a host
    # named ctl01 would yield ctl), then matches when, per cluster, the number
    # of keepalived procstat_running series equals the number of those series
    # whose value is 0 - i.e. every reported process in the cluster is down.
    #
    # The expression is wrapped across lines for readability only: inside a
    # ">-" folded scalar, line breaks between equally indented lines fold to a
    # single space, so the resulting string is unchanged. Keep all lines of
    # the expression at the same indentation and free of trailing whitespace.
    KeepalivedServiceOutage:
      if: >-
        count(label_replace(procstat_running{process_name="keepalived"},
        "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
        ==
        count(label_replace(procstat_running{process_name="keepalived"} == 0,
        "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
    {% raw %}
      labels:
        severity: critical
        service: keepalived
      annotations:
        summary: "Keepalived service outage"
        description: "All Keepalived processes within the {{ $labels.cluster }} cluster are down."
    {% endraw %}
{%- endif %}