blob: fac53c209c3da0b4bcf901142598dab60f3d3fa3 [file] [log] [blame]
{%- from "linux/map.jinja" import monitoring, network with context %}
server:
alert:
{%- raw %}
# Warning tier: cpu_usage_iowait has stayed above 40 for 10 minutes.
SystemCpuIoWaitWarning:
  if: >-
    cpu_usage_iowait > 40
  for: 10m
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'CPU waited for I/O 40% of time'
    description: 'The CPU on the {{ $labels.host }} node spent 40% of time waiting for I/O.'
# Critical tier: cpu_usage_iowait has stayed above 50 for 10 minutes.
SystemCpuIoWaitCritical:
  if: >-
    cpu_usage_iowait > 50
  for: 10m
  labels:
    severity: critical
    service: system
  annotations:
    summary: 'CPU waited for I/O 50% of time'
    description: 'The CPU on the {{ $labels.host }} node spent 50% of time waiting for I/O.'
{%- endraw %}
# Steal-time thresholds are read from the monitoring map and coerced to float.
# The critical threshold set here is consumed by the alert that follows this one.
{%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %}
{%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %}
# Warning tier: cpu_usage_steal has stayed above the warn threshold for 5 minutes.
# The raw block inside the description passes the Prometheus host-label
# placeholder through the template engine untouched.
SystemCpuStealTimeWarning:
if: >-
cpu_usage_steal > {{ cpu_steal_warn }}
for: 5m
labels:
severity: warning
service: system
annotations:
summary: "CPU steal time warning"
description: "The CPU steal time was above {{ cpu_steal_warn }}% on the {%- raw %} {{ $labels.host }}{%- endraw %} node for 5 minutes."
# Critical tier: cpu_usage_steal has stayed above cpu_steal_crit for 5 minutes.
# NOTE: the raw block opened inside the description is left open on purpose;
# it is closed right after the name line of the next alert.
SystemCpuStealTimeCritical:
if: >-
cpu_usage_steal > {{ cpu_steal_crit }}
for: 5m
labels:
severity: critical
service: system
annotations:
summary: "CPU steal time critical"
description: "The CPU steal time was above {{ cpu_steal_crit }}% on the {%- raw %} {{ $labels.host }} node for 5 minutes."
# Fires when 100 minus the 5-minute average of cpu_usage_idle (cpu-total series)
# stays above the configured warn threshold for 2 minutes.
# The name line is still inside the raw block left open by the previous alert;
# raw mode is switched off wherever the threshold has to be interpolated.
SystemCpuFullWarning:
{%- endraw %}
{%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
if: >-
100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
{% raw %}
for: 2m
labels:
severity: warning
service: system
annotations:
summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes."
{%- endraw %}
# Per-CPU load alerts (system_load15 divided by system_n_cpus), held for 5 minutes.
# They are rendered only when the nova:compute:enabled pillar is not True, and the
# host matcher additionally skips hosts whose name matches .*cmp[0-9]+.
# In both summaries the endraw tag carries a leading dash, which trims the blank
# in front of it, so the separating space is written after the tag; otherwise the
# threshold would be glued to the word "is".
{% if not ([
    salt['pillar.get']('nova:compute:enabled', False)
  ]|select('equalto', True)|list) %} {# glorified `not any(<iterable>)` condition #}
SystemLoadTooHighWarning:
{%- set load_threshold = monitoring.system_load_threshold.warn|float %}
  if: >-
    system_load15{host!~".*cmp[0-9]+"} / system_n_cpus > {{ load_threshold }}
{%- raw %}
  for: 5m
  labels:
    severity: warning
    service: system
  annotations:
    summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
    description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
{%- endraw %}
SystemLoadTooHighCritical:
{%- set load_threshold = monitoring.system_load_threshold.crit|float %}
  if: >-
    system_load15{host!~".*cmp[0-9]+"} / system_n_cpus > {{ load_threshold }}
{%- raw %}
  for: 5m
  labels:
    severity: critical
    service: system
  annotations:
    summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
    description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
{%- endraw %}
{% endif %}
# Warning tier: disk_used_percent, on series whose mode label is not "ro", has
# been at or above the configured warn threshold for 2 minutes.
# The raw block reopened in the summary stays open past the end of this alert.
SystemDiskFullWarning:
{%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
if: >-
disk_used_percent{mode!="ro"} >= {{ disk_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: system
annotations:
summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
# Major tier of the disk-usage alert: same expression as the warning tier, but
# compared against the "major" threshold, which re-binds disk_threshold.
# The name line is emitted while the previous alert's raw block is still open.
SystemDiskFullMajor:
{%- endraw %}
{%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
if: >-
disk_used_percent{mode!="ro"} >= {{ disk_threshold }}
{%- raw %}
for: 2m
labels:
severity: major
service: system
annotations:
summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
# Warning tier: used inodes as a percentage of total inodes (series whose mode
# label is not "ro") has been at or above the warn threshold for 2 minutes.
# Raw mode is closed after the name line so the threshold can be interpolated,
# and reopened in the summary; it stays open past the end of this alert.
SystemDiskInodesFullWarning:
{%- endraw %}
{%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
if: >-
100 * disk_inodes_used{mode!="ro"} / disk_inodes_total{mode!="ro"} >= {{ inodes_threshold }}
for: 2m
labels:
severity: warning
service: system
annotations:
summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
# Major tier of the inode-usage alert: same expression as the warning tier, but
# compared against the "major" threshold, which re-binds inodes_threshold.
# The raw block reopened in the summary stays open for the static alerts after it.
SystemDiskInodesFullMajor:
{%- endraw %}
{%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
if: >-
100 * disk_inodes_used{mode!="ro"} / disk_inodes_total{mode!="ro"} >= {{ inodes_threshold }}
for: 2m
labels:
severity: major
service: system
annotations:
summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
# hdd_errors_total increased within the last minute, sustained for 5 minutes.
SystemDiskErrorsTooHigh:
  if: >-
    increase(hdd_errors_total[1m]) > 0
  for: 5m
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Disk {{ $labels.device }} is failing'
    description: 'The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes.'
# Warning tier: diskio_weighted_io_time grew by more than 2000 over 10 minutes
# (the annotations describe this as 2 seconds). No hold period is set.
SystemDiskBacklogWarning:
  if: >-
    increase(diskio_weighted_io_time[10m]) > 2000
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Disk {{ $labels.name }} requests waited 2 seconds'
    description: 'I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 2 seconds on the device during the last 10 minutes.'
# Critical tier: diskio_weighted_io_time grew by more than 5000 over 10 minutes
# (the annotations describe this as 5 seconds). No hold period is set.
SystemDiskBacklogCritical:
  if: >-
    increase(diskio_weighted_io_time[10m]) > 5000
  labels:
    severity: critical
    service: system
  annotations:
    summary: 'Disk {{ $labels.name }} requests waited 5 seconds'
    description: 'I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 5 seconds on the device during the last 10 minutes.'
# Warning tier: diskio_io_time grew by more than 0.9 * 10 * 60 * 1000 over
# 10 minutes, i.e. 90% of the window (presumably counted in milliseconds).
SystemDiskRequestQueuedWarning:
  if: >-
    increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Disk {{ $labels.name }} requests were queued for 90% of time'
    description: 'I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 90% of the device time during the last 10 minutes.'
# Critical tier: diskio_io_time grew by more than 0.98 * 10 * 60 * 1000 over
# 10 minutes, i.e. 98% of the window (presumably counted in milliseconds).
SystemDiskRequestQueuedCritical:
  if: >-
    increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000
  labels:
    severity: critical
    service: system
  annotations:
    summary: 'Disk {{ $labels.name }} requests were queued for 98% of time'
    description: 'I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 98% of the device time during the last 10 minutes.'
# Warning tier: both conditions must hold for 2 minutes - more than 90% of
# memory used and mem_available below 8 * 2^30.
SystemMemoryFullWarning:
  if: >-
    mem_used_percent > 90 and mem_available < 8 * 2^30
  for: 2m
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'More than 90% of memory is used and less than 8 GB of memory is available'
    description: 'The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes.'
# Major tier: both conditions must hold for 2 minutes - more than 95% of
# memory used and mem_available below 4 * 2^30.
SystemMemoryFullMajor:
  if: >-
    mem_used_percent > 95 and mem_available < 4 * 2^30
  for: 2m
  labels:
    severity: major
    service: system
  annotations:
    summary: 'More than 95% of memory is used and less than 4 GB of memory is available'
    description: 'The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes.'
# Fires when net_drop_in grew by more than the configured warn threshold within
# the last minute. Series that share host and interface with a
# bond_slave_active == 0 series are excluded by the "unless" clause.
# No hold period is set. The threshold is used as-is (no float coercion).
SystemRxPacketsDroppedTooHigh:
{%- endraw %}
{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
if: >-
increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }} unless on (host,interface) bond_slave_active == 0
labels:
severity: warning
service: system
annotations:
summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
# Fires when net_drop_out grew by more than the configured warn threshold within
# the last minute. Unlike the receive-side alert, there is no bond-slave exclusion.
# No hold period is set. The threshold is used as-is (no float coercion).
SystemTxPacketsDroppedTooHigh:
{%- endraw %}
{%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
if: >-
increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
labels:
severity: warning
service: system
annotations:
summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
# procstat_running for the cron process has been 0 for 2 minutes.
CronProcessDown:
  if: >-
    procstat_running{process_name="cron"} == 0
  for: 2m
  labels:
    severity: critical
    service: system
  annotations:
    summary: 'Cron process is down'
    description: 'The cron process on the {{ $labels.host }} node is down.'
# procstat_running for the sshd process has been 0 for 2 minutes.
SshdProcessDown:
  if: >-
    procstat_running{process_name="sshd"} == 0
  for: 2m
  labels:
    severity: critical
    service: system
  annotations:
    summary: 'SSH process is down'
    description: 'The SSH process on the {{ $labels.host }} node is down.'
# Fires when failed_logins_total grew by more than the configured warn threshold
# over the last 5 minutes. No hold period is set.
# The raw block reopened in the summary stays open for the static alerts after it.
SshFailedLoginsTooHigh:
{%- endraw %}
{%- set threshold = monitoring.failed_auths_threshold.warn %}
if: >-
increase(failed_logins_total[5m]) > {{ threshold }}
labels:
severity: warning
service: system
annotations:
summary: "{{ threshold }}{%- raw %} failed SSH logins"
description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
# Warning tier: the floored 10-minute increase of nstat_packet_drop is above 0.
PacketsDroppedByCpuWarning:
  if: >-
    floor(increase(nstat_packet_drop[10m])) > 0
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Increased number of CPU dropped packets'
    description: 'The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes.'
# Minor tier: the floored 10-minute increase of nstat_packet_drop is above 100.
PacketsDroppedByCpuMinor:
  if: >-
    floor(increase(nstat_packet_drop[10m])) > 100
  labels:
    severity: minor
    service: system
  annotations:
    summary: 'CPU dropped more than 100 packets'
    description: 'The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes.'
# Fires when the highest per-CPU 5-minute rate of nstat_time_squeeze (the cpu
# label is aggregated away) stays above the configured squeeze-rate threshold
# for 7 minutes. The threshold is used as-is (no float coercion).
NetdevBudgetRanOutsWarning:
{%- endraw %}
{%- set squeeze_rate_threshold = monitoring.netdev_budget_squeeze_rate %}
if: >-
max(rate(nstat_time_squeeze[5m])) without (cpu) > {{ squeeze_rate_threshold }}
for: 7m
labels:
severity: warning
service: system
annotations:
summary: "CPU terminated {{ squeeze_rate_threshold }}{%- raw %} net_rx_action loops per second"
description: "The rate of net_rx_action loops terminations on the {{ $labels.host }} node is {{ $value }} per second during the last 7 minutes. Modify the net.core.netdev_budget and net.core.netdev_budget_usecs kernel parameters."
{%- endraw %}
{%- if network.bridge == 'openvswitch' %}
{%- raw %}
# Rendered only when the network bridge type is openvswitch.
# Both tiers compare the ovs-vswitchd procstat_memory_vms value with the
# mem_total of the same host, held for 5 minutes.
ProcessOVSVswitchdMemoryWarning:
  if: >-
    procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.2
  for: 5m
  labels:
    severity: warning
    service: ovs
  annotations:
    summary: 'ovs-vswitchd takes more than 20% of system memory'
    description: 'ovs-vswitchd takes more than 20% of system memory'
ProcessOVSVswitchdMemoryCritical:
  if: >-
    procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.3
  for: 5m
  labels:
    severity: critical
    service: ovs
  annotations:
    summary: 'ovs-vswitchd takes more than 30% of system memory'
    description: 'ovs-vswitchd takes more than 30% of system memory'
{%- endraw %}
{%- endif %}
{#- Collect the names of enabled interfaces of type 'bond'; the resulting list gates the bond alerts. #}
{%- set bond_interfaces = [] %}
{%- for iface_name, iface in network.interface.items() if iface.type == 'bond' and iface.enabled == True %}
  {%- do bond_interfaces.append(iface_name) %}
{%- endfor %}
{%- if bond_interfaces %}
{%- raw %}
# Bond alerts are rendered only when at least one enabled bond interface exists.
# This one fires as soon as bond_status drops below 1 (no hold period).
BondInterfaceDown:
  if: >-
    bond_status < 1
  labels:
    severity: critical
    service: system
  annotations:
    summary: '{{ $labels.bond }} bond interface is down'
    description: 'The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down.'
# Fires per slave as soon as its bond_slave_status drops below 1 (no hold period).
BondInterfaceSlaveDown:
  if: >-
    bond_slave_status < 1
  labels:
    severity: warning
    service: system
  annotations:
    summary: '{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down'
    description: 'The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down.'
# Fires when, for a (bond, host) pair, the sum of bond_slave_status is at most
# half of the number of slaves, i.e. at least 50% of the slaves are down.
# Both sides are aggregated by (bond,host): the "on (bond,host)" matching can
# only pair series that carry those labels, so an ungrouped count would never
# match and the alert could never fire.
# The alert value is the left-hand side (the number of slaves reporting status
# 1), so the description states how many slaves are still up.
BondInterfaceSlaveDownMajor:
  if: >-
    sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host)
  labels:
    severity: major
    service: system
  annotations:
    summary: "50% of bond interface slaves {{ $labels.bond }} are down"
    description: "Only {{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are up."
# Fires when exactly one bond_slave_status series exists for a (bond, host) pair.
BondInterfaceSingleSlave:
  if: >-
    count(bond_slave_status) by (bond,host) == 1
  labels:
    severity: major
    service: system
  annotations:
    summary: 'The {{ $labels.bond }} bond interface has only one slave'
    description: 'The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has only one slave.'
{%- endraw %}
{%- endif %}