Merge "Add I/O wait-related alerts"
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index a387b16..fac53c2 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -1,6 +1,28 @@
{%- from "linux/map.jinja" import monitoring, network with context %}
server:
alert:
+ {%- raw %}
+ SystemCpuIoWaitWarning:  # Warning-level alert on cpu_usage_iowait staying above 40.
+ if: >-  # Folded scalar holding the alert expression.
+ cpu_usage_iowait > 40
+ for: 10m  # The expression must stay true this long before the alert fires.
+ labels:  # Labels attached to the fired alert.
+ severity: warning
+ service: system
+ annotations:  # Human-readable text; the threshold is a lower bound, not the measured value.
+ summary: "CPU waited for I/O more than 40% of time"
+ description: "The CPU on the {{ $labels.host }} node spent more than 40% of time waiting for I/O for the last 10 minutes."
+ SystemCpuIoWaitCritical:  # Critical-level alert on cpu_usage_iowait staying above 50.
+ if: >-  # Folded scalar holding the alert expression.
+ cpu_usage_iowait > 50
+ for: 10m  # The expression must stay true this long before the alert fires.
+ labels:  # Labels attached to the fired alert.
+ severity: critical
+ service: system
+ annotations:  # Human-readable text; the threshold is a lower bound, not the measured value.
+ summary: "CPU waited for I/O more than 50% of time"
+ description: "The CPU on the {{ $labels.host }} node spent more than 50% of time waiting for I/O for the last 10 minutes."
+ {%- endraw %}
{%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %}
{%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %}
SystemCpuStealTimeWarning:
@@ -126,6 +148,42 @@
annotations:
summary: "Disk {{ $labels.device }} is failing"
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
+ SystemDiskBacklogWarning:  # Warning-level alert on weighted I/O time accrued per disk.
+ if: >-  # No "for" clause here; the 10m increase() range is the only time window.
+ increase(diskio_weighted_io_time[10m]) > 2000
+ labels:  # Labels attached to the fired alert.
+ severity: warning
+ service: system
+ annotations:  # 2000 is reported as 2 seconds, so the counter is presumably in milliseconds.
+ summary: "Disk {{ $labels.name }} requests waited more than 2 seconds"
+ description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited more than 2 seconds in total on the device during the last 10 minutes."
+ SystemDiskBacklogCritical:  # Critical-level alert on weighted I/O time accrued per disk.
+ if: >-  # No "for" clause here; the 10m increase() range is the only time window.
+ increase(diskio_weighted_io_time[10m]) > 5000
+ labels:  # Labels attached to the fired alert.
+ severity: critical
+ service: system
+ annotations:  # 5000 is reported as 5 seconds, so the counter is presumably in milliseconds.
+ summary: "Disk {{ $labels.name }} requests waited more than 5 seconds"
+ description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited more than 5 seconds in total on the device during the last 10 minutes."
+ SystemDiskRequestQueuedWarning:  # Warning-level alert on the share of time a disk had requests queued.
+ if: >-  # Threshold is 0.9 of a 10-minute window written out as 10 * 60 * 1000 (540000).
+ increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000
+ labels:  # Labels attached to the fired alert.
+ severity: warning
+ service: system
+ annotations:  # The comparison is strict, so the text reports "more than" the threshold.
+ summary: "Disk {{ $labels.name }} requests were queued for more than 90% of time"
+ description: "The {{ $labels.name }} disk on the {{ $labels.host }} node had I/O requests queued for more than 90% of the time during the last 10 minutes."
+ SystemDiskRequestQueuedCritical:  # Critical-level alert on the share of time a disk had requests queued.
+ if: >-  # Threshold is 0.98 of a 10-minute window written out as 10 * 60 * 1000 (588000).
+ increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000
+ labels:  # Labels attached to the fired alert.
+ severity: critical
+ service: system
+ annotations:  # The comparison is strict, so the text reports "more than" the threshold.
+ summary: "Disk {{ $labels.name }} requests were queued for more than 98% of time"
+ description: "The {{ $labels.name }} disk on the {{ $labels.host }} node had I/O requests queued for more than 98% of the time during the last 10 minutes."
SystemMemoryFullWarning:
if: >-
mem_used_percent > 90 and mem_available < 8 * 2^30