{#- Prometheus-style alert rules for Keepalived.
    The whole document is emitted only when cluster.enabled is truthy.
    Each alert wraps its labels/annotations in a raw/endraw pair so that the
    $labels placeholders in the descriptions are passed through literally
    instead of being evaluated by Jinja at render time. #}
{%- from "keepalived/map.jinja" import cluster with context %}
{%- if cluster.get('enabled', False) %}
server:
  alert:
    # Fires when procstat_running for the keepalived process reports 0.
    KeepalivedProcessDown:
      if: >-
        procstat_running{process_name="keepalived"} == 0
      {% raw %}
      labels:
        severity: major
        service: keepalived
      annotations:
        summary: "Keepalived process is down"
        description: "The Keepalived process on the {{ $labels.host }} node is down."
      {% endraw %}
    # Fires when the keepalived_state metric equals 0 (reported as FAILED).
    KeepalivedFailedState:
      if: >-
        keepalived_state == 0
      {% raw %}
      labels:
        severity: minor
        service: keepalived
      annotations:
        summary: "Keepalived state is FAILED"
        description: "Keepalived is in the FAILED state on the {{ $labels.host }} node."
      {% endraw %}
    # Fires when more than one ipcheck_assigned series exists for the same
    # ip label, and that condition has held for 2 minutes.
    KeepalivedMultipleIPAddr:
      if: >-
        count(ipcheck_assigned) by (ip) > 1
      {% raw %}
      for: 2m
      labels:
        severity: major
        service: keepalived
      annotations:
        summary: "Keepalived VIP is assigned more than once"
        description: "The Keepalived {{ $labels.ip }} virtual IP is assigned more than once."
      {% endraw %}
    # Derives a "cluster" label from the leading non-digit part of the host
    # label, then fires when the number of keepalived process series in a
    # cluster equals the number of those series reporting 0.
    # The expression is split over equally indented lines; the folded scalar
    # joins them with single spaces, so it renders as one line.
    KeepalivedServiceOutage:
      if: >-
        count(label_replace(procstat_running{process_name="keepalived"}, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
        ==
        count(label_replace(procstat_running{process_name="keepalived"} == 0, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
      {% raw %}
      labels:
        severity: critical
        service: keepalived
      annotations:
        summary: "Keepalived service outage"
        description: "All Keepalived processes within the {{ $labels.cluster }} cluster are down."
      {% endraw %}
{%- endif %}