{#-
  Alert rule definitions for keepalived.

  The whole document is emitted only when the imported `cluster` mapping has
  a truthy `enabled` key; otherwise the template renders to nothing.

  Each rule keeps its `if` expression outside the raw section and wraps the
  remaining keys in a raw/endraw pair, so that the $labels placeholders in
  the annotations are passed through verbatim instead of being evaluated
  by Jinja.
-#}
{%- from "keepalived/map.jinja" import cluster with context %}
{%- if cluster.get('enabled', False) %}
server:
  alert:
    # The keepalived process is reported as not running (value 0) for 2 minutes.
    KeepalivedProcessDown:
      if: >-
        procstat_running{process_name="keepalived"} == 0
      {% raw %}
      for: 2m
      labels:
        severity: major
        service: keepalived
      annotations:
        summary: "Keepalived process is down"
        description: "The Keepalived process on the {{ $labels.host }} node is down."
      {% endraw %}
    # keepalived_up equals 0. No "for" clause is set on this rule.
    KeepalivedProcessNotResponsive:
      if: >-
        keepalived_up == 0
      {% raw %}
      labels:
        severity: major
        service: keepalived
      annotations:
        summary: "Keepalived process is not responding"
        description: "The Keepalived process on the {{ $labels.host }} node is not responding."
      {% endraw %}
    # keepalived_state equals 0, which this rule reports as the FAILED state.
    KeepalivedFailedState:
      if: >-
        keepalived_state == 0
      {% raw %}
      labels:
        severity: minor
        service: keepalived
      annotations:
        summary: "Keepalived VRRP state is FAILED"
        description: "The Keepalived VRRP {{ $labels.name }} is in the FAILED state on the {{ $labels.host }} node."
      {% endraw %}
    # keepalived_state equals -1, which this rule reports as the UNKNOWN state.
    KeepalivedUnknownState:
      if: >-
        keepalived_state == -1
      {% raw %}
      labels:
        severity: minor
        service: keepalived
      annotations:
        summary: "Keepalived VRRP state is UNKNOWN"
        description: "The Keepalived VRRP {{ $labels.name }} is in the UNKNOWN state on the {{ $labels.host }} node."
      {% endraw %}
    # More than one ipcheck_assigned series exists for the same ip label
    # for 2 minutes.
    KeepalivedMultipleIPAddr:
      if: >-
        count(ipcheck_assigned) by (ip) > 1
      {% raw %}
      for: 2m
      labels:
        severity: major
        service: keepalived
      annotations:
        summary: "Keepalived VIP is assigned more than once"
        description: "The Keepalived {{ $labels.ip }} virtual IP is assigned more than once."
      {% endraw %}
    # A "cluster" label is derived from the leading non-digit part of the
    # "host" label. The rule matches when, per cluster, the number of
    # keepalived process series equals the number of those series whose
    # value is 0, and that holds for 2 minutes.
    KeepalivedServiceOutage:
      if: >-
        count(label_replace(procstat_running{process_name="keepalived"}, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster) == count(label_replace(procstat_running{process_name="keepalived"} == 0, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
      {% raw %}
      for: 2m
      labels:
        severity: critical
        service: keepalived
      annotations:
        summary: "Keepalived service outage"
        description: "All Keepalived processes within the {{ $labels.cluster }} cluster are down."
      {% endraw %}
{%- endif %}