blob: 153705cbed6f144f4601ca3324e7717a7584e264 [file] [log] [blame]
{%- from "linux/map.jinja" import monitoring with context %}
server:
alert:
SystemCpuIdleTooLow:
{%- set cpu_idle_threshold = monitoring.cpu_idle_percentage.warn|float %}
if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}
{% raw %}
labels:
severity: warning
service: system
annotations:
summary: 'Idle CPU usage too low on {{ $labels.host }}'
description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ cpu_idle_threshold}}%).'
SystemDiskSpaceTooLow:
if: 'predict_linear(disk_free[1h], 8*3600) < 0'
{% raw %}
for: 15m
labels:
severity: warning
service: system
annotations:
summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
{% endraw %}
SystemFreeOpenFilesTooLow:
if: 'predict_linear(linux_sysctl_fs_file_nr[1h], 8*3600) > linux_sysctl_fs_file_max'
{% raw %}
labels:
severity: warning
service: system
annotations:
summary: 'Free open files for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'Host {{ $labels.host }}) will run out of free open files in less than 8 hours.'
{% endraw %}
SystemDiskErrors:
if: 'increase(hdd_errors_total[5m]) > 0'
{% raw %}
labels:
severity: critical
service: system
annotations:
summary: 'Disk {{ $labels.device }} is failing'
description: 'The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}.'
{% endraw %}
SystemDiskSpaceFull:
if: 'disk_used_percent >= 99 and disk_inodes_total > 0'
{% raw %}
labels:
severity: critical
service: system
annotations:
summary: 'Disk partition {{ $labels.path }} full on {{ $labels.host }}'
description: 'The disk partition ({{ $labels.path }}) is used at {{ $value }}% on {{ $labels.host }}.'
{% endraw %}
SystemDiskInodesTooLow:
if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
{% raw %}
for: 15m
labels:
severity: warning
service: system
annotations:
summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
{% endraw %}
SystemDiskInodesFull:
if: 'disk_inodes_used / disk_inodes_total >= 0.99'
{% raw %}
labels:
severity: critical
service: system
annotations:
summary: 'Inodes for {{ $labels.path }} full on {{ $labels.host }}'
description: 'The disk inodes ({{ $labels.path }}) are used at {{ $value }}% on {{ $labels.host }}.'
{% endraw %}
SystemMemoryAvailableLow:
{%- set mem_avail_warn_threshold = monitoring.free_memory_percentage.warn|float %}
if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_warn_threshold }}
{% raw %}
labels:
severity: warning
service: system
annotations:
summary: 'Free memory low on {{ $labels.host }}'
description: 'The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_warn_threshold }}%).'
SystemMemoryAvailableTooLow:
{%- set mem_avail_crit_threshold = monitoring.free_memory_percentage.crit|float %}
if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_crit_threshold }}
{% raw %}
labels:
severity: critical
service: system
annotations:
summary: 'Free memory too low on {{ $labels.host }}'
description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_crit_threshold }}%).'
SystemLoad5TooHigh:
if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
{% raw %}
labels:
severity: warning
service: system
annotations:
summary: 'High system load (5m) on {{ $labels.host }}'
description: 'The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ monitoring.load_5.warn }}).'
SystemRxPacketsDroppedTooHigh:
{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
if: rate(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
{% raw %}
labels:
severity: critical
service: system
annotations:
summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_rx_dropped_threshold }}/sec)'
SystemTxPacketsDroppedTooHigh:
{%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
if: rate(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
{% raw %}
labels:
severity: critical
service: system
annotations:
summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_tx_dropped_threshold }}/sec)'
SystemSwapIn:
{%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
if: rate(swap_in[2m]) > {{ swap_in_threshold }}
{% raw %}
labels:
severity: warning
service: system
annotations:
summary: 'Swap input throughput too high on {{ $labels.host }}'
description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }}b/s).'
SystemSwapOut:
{%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
if: rate(swap_out[2m]) > {{ swap_out_threshold }}
{% raw %}
labels:
severity: warning
service: system
annotations:
summary: 'Swap output throughput too high on {{ $labels.host }}'
description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }}b/s).'
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
BondInterfaceDown:
if: 'bond_status < 1'
{% raw %}
labels:
severity: critical
service: system
annotations:
summary: 'Bond status interface {{ $labels.bond }} is DOWN on {{ $labels.host }}'
description: 'The bond interface ({{ $labels.bond }) has all ifaces in a down state on {{ $labels.host }}.'
{% endraw %}
BondSlaveInterfaceStatus:
if: 'bond_slave_status < 1'
{% raw %}
labels:
severity: warning
service: system
annotations:
summary: 'Bond slave interface {{ $labels.interface }} is DOWN on {{ $labels.host }} for {{ $labels.bond }}'
description: 'The bond slave interface ({{ $labels.interface }) is in DOWN state for {{ $labels.bond }} on {{ $labels.host }}.'
{% endraw %}
{%- endif %}