| {%- from "linux/map.jinja" import monitoring, network with context %} |
| server: |
| alert: |
| {%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %} |
| {%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %} |
| SystemCpuStealTimeWarning: |
| if: >- |
| cpu_usage_steal > {{ cpu_steal_warn }} |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "CPU steal time warning" |
| description: "The CPU steal time was above {{ cpu_steal_warn }}% on the {%- raw %} {{ $labels.host }}{%- endraw %} node for 5 minutes." |
| SystemCpuStealTimeCritical: |
| if: >- |
| cpu_usage_steal > {{ cpu_steal_crit }} |
| for: 5m |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "CPU steal time critical" |
| description: "The CPU steal time was above {{ cpu_steal_crit }}% on the {%- raw %} {{ $labels.host }} node for 5 minutes." |
| SystemCpuFullWarning: |
| {%- endraw %} |
| {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %} |
| if: >- |
| 100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage" |
| description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes." |
| SystemLoadTooHighWarning: |
| {%- endraw %} |
| {%- set load_threshold = monitoring.system_load_threshold.warn|float %} |
| if: >- |
| system_load5 / system_n_cpus > {{ load_threshold }} |
| {%- raw %} |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}" |
| description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes." |
| SystemLoadTooHighCritical: |
| {%- endraw %} |
| {%- set load_threshold = monitoring.system_load_threshold.crit|float %} |
| if: >- |
| system_load5 / system_n_cpus > {{ load_threshold }} |
| {%- raw %} |
| for: 5m |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}" |
| description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes." |
| SystemDiskFullWarning: |
| {%- endraw %} |
| {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %} |
| if: >- |
| disk_used_percent >= {{ disk_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full" |
| description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes." |
| SystemDiskFullMajor: |
| {%- endraw %} |
| {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %} |
| if: >- |
| disk_used_percent >= {{ disk_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full" |
| description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes." |
| SystemDiskInodesFullWarning: |
| {%- endraw %} |
| {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %} |
| if: >- |
| 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }} |
| for: 2m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used" |
| description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes." |
| SystemDiskInodesFullMajor: |
| {%- endraw %} |
| {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %} |
| if: >- |
| 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }} |
| for: 2m |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used" |
| description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes." |
| SystemDiskErrorsTooHigh: |
| if: >- |
| increase(hdd_errors_total[1m]) > 0 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "Disk {{ $labels.device }} is failing" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes." |
| {%- endraw %} |
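| {#- |
| The SMART alerts below are rendered only on bare-metal nodes, where the `virtual` grain |
| reports 'physical'; virtual machines report their hypervisor type instead. A node can be |
| checked with: salt-call grains.get virtual |
| -#} |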
| {%- if grains.get('virtual', None) == 'physical' %} |
| {%- raw %} |
| SystemSMARTDiskUDMACrcErrorsTooHigh: |
| if: >- |
| increase(smart_device_udma_crc_errors[1m]) > 0 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has UDMA CRC errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting UDMA CRC errors for 5 minutes." |
| SystemSMARTDiskHealthStatus: |
| if: >- |
| smart_device_health_ok == 0 |
| for: 1m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has bad health" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting a bad health status for 1 minute." |
| SystemSMARTDiskReadErrorRate: |
| if: >- |
| increase(smart_device_read_error_rate[1m]) > 0 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has read errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased read error rate for 5 minutes." |
| SystemSMARTDiskSeekErrorRate: |
| if: >- |
| increase(smart_device_seek_error_rate[1m]) > 0 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has seek errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased seek error rate for 5 minutes." |
| SystemSMARTDiskTemperatureHigh: |
| if: >- |
| smart_device_temp_c >= 60 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk temperature is high" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has a temperature of {{ $value }}C for 5 minutes." |
| SystemSMARTDiskReallocatedSectorsCount: |
| if: >- |
| smart_attribute_raw_value{name="Reallocated_Sector_Ct"} > 10 |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has reallocated sectors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has reallocated {{ $value }} sectors." |
| SystemSMARTDiskCurrentPendingSectors: |
| if: >- |
| smart_attribute_raw_value{name="Current_Pending_Sector"} > 0 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has current pending sectors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'current pending' sectors." |
| SystemSMARTDiskReportedUncorrectableErrors: |
| if: >- |
| smart_attribute_raw_value{name="Reported_Uncorrect"} > 0 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has reported uncorrectable errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'reported uncorrectable' errors." |
| SystemSMARTDiskOfflineUncorrectableSectors: |
| if: >- |
| smart_attribute_raw_value{name="Offline_Uncorrectable"} > 0 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has offline uncorrectable sectors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'offline uncorrectable' sectors." |
| SystemSMARTDiskEndToEndError: |
| if: >- |
| smart_attribute_raw_value{name="End-to-End_Error"} > 0 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has end-to-end errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'end-to-end' errors." |
| {%- endraw %} |
| {%- endif %} |
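| {#- |
| Note on the memory alerts below: `^` is the PromQL power operator, so 8 * 2^30 and |
| 4 * 2^30 express 8 GiB and 4 GiB in bytes. |
| -#} |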
| {%- raw %} |
| SystemMemoryFullWarning: |
| if: >- |
| mem_used_percent > 90 and mem_available < 8 * 2^30 |
| for: 2m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "More than 90% of memory is used or less than 8 GB of memory is available" |
| description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes." |
| SystemMemoryFullMajor: |
| if: >- |
| mem_used_percent > 95 and mem_available < 4 * 2^30 |
| for: 2m |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "More than 95% of memory is used or less than 4 GB of memory is available" |
| description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes." |
| SystemRxPacketsDroppedTooHigh: |
| {%- endraw %} |
| {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %} |
| if: >- |
| increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }} unless on (host,interface) bond_slave_active == 0 |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped" |
| description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute." |
| SystemTxPacketsDroppedTooHigh: |
| {%- endraw %} |
| {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %} |
| if: >- |
| increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }} |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped" |
| description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute." |
| CronProcessDown: |
| if: >- |
| procstat_running{process_name="cron"} == 0 |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "Cron process is down" |
| description: "The cron process on the {{ $labels.host }} node is down." |
| SshdProcessDown: |
| if: >- |
| procstat_running{process_name="sshd"} == 0 |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "SSH process is down" |
| description: "The SSH process on the {{ $labels.host }} node is down." |
| SshFailedLoginsTooHigh: |
| {%- endraw %} |
| {%- set threshold = monitoring.failed_auths_threshold.warn %} |
| if: >- |
| increase(failed_logins_total[5m]) > {{ threshold }} |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ threshold }}{%- raw %} failed SSH logins" |
| description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes." |
| PacketsDroppedByCpuWarning: |
| if: >- |
| floor(increase(nstat_packet_drop[10m])) > 0 |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "Increased number of CPU dropped packets" |
| description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes." |
| PacketsDroppedByCpuMinor: |
| if: >- |
| floor(increase(nstat_packet_drop[10m])) > 100 |
| labels: |
| severity: minor |
| service: system |
| annotations: |
| summary: "CPU dropped more than 100 packets" |
| description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes." |
| NetdevBudgetRanOutsWarning: |
| {%- endraw %} |
| {%- set squeeze_rate_threshold = monitoring.netdev_budget_squeeze_rate %} |
| if: >- |
| max(rate(nstat_time_squeeze[5m])) without (cpu) > {{ squeeze_rate_threshold }} |
| for: 7m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "CPU terminated {{ squeeze_rate_threshold }}{%- raw %} net_rx_action loops per second" |
| description: "The rate of net_rx_action loops terminations on the {{ $labels.host }} node is {{ $value }} per second during the last 7 minutes. Modify the net.core.netdev_budget and net.core.netdev_budget_usecs kernel parameters." |
| {%- endraw %} |
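| {#- |
| The ovs-vswitchd memory alerts are rendered only when the formula is configured with |
| Open vSwitch as the bridge backend (network.bridge == 'openvswitch' in linux/map.jinja). |
| -#} |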
| {%- if network.bridge == 'openvswitch' %} |
| {%- raw %} |
| ProcessOVSVswitchdMemoryWarning: |
| if: procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.2 |
| for: 5m |
| labels: |
| severity: warning |
| service: ovs |
| annotations: |
| summary: "ovs-vswitchd takes more than 20% of system memory" |
| description: "ovs-vswitchd takes more than 20% of system memory" |
| ProcessOVSVswitchdMemoryCritical: |
| if: procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.3 |
| for: 5m |
| labels: |
| severity: critical |
| service: ovs |
| annotations: |
| summary: "ovs-vswitchd takes more than 30% of system memory" |
| description: "ovs-vswitchd takes more than 30% of system memory" |
| {%- endraw %} |
| {%- endif %} |
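| {#- |
| The loop below collects the bond interfaces that are defined and enabled under |
| network.interface; the bonding alerts are rendered only when at least one such interface |
| exists on the node. |
| -#} |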
| {%- set bond_interfaces = [] %} |
| {%- for interface_name, interface in network.interface.items() %} |
| {%- if interface.type == 'bond' and interface.enabled == True %} |
| {%- do bond_interfaces.append(interface_name) %} |
| {%- endif %} |
| {%- endfor %} |
| {%- if bond_interfaces|length > 0 %} |
| {%- raw %} |
| BondInterfaceDown: |
| if: >- |
| bond_status < 1 |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "{{ $labels.bond }} bond interface is down" |
| description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down." |
| BondInterfaceSlaveDown: |
| if: >- |
| bond_slave_status < 1 |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down" |
| description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down." |
| BondInterfaceSlaveDownMajor: |
| if: >- |
| sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host) |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "50% of bond interface slaves {{ $labels.bond }} are down" |
| description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down." |
| BondInterfaceSingleSlave: |
| if: >- |
| count(bond_slave_status) by (bond,host) == 1 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.bond }} bond interface has only one slave" |
| description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has only one slave." |
| {%- endraw %} |
| {%- endif %} |