Merge "Add I/O wait-related alerts"
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index a387b16..fac53c2 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -1,6 +1,28 @@
 {%- from "linux/map.jinja" import monitoring, network with context %}
 server:
   alert:
+    {%- raw %}
+    SystemCpuIoWaitWarning:  # warning tier: iowait above 40 held for the full "for" period
+      if: >-
+        cpu_usage_iowait > 40
+      for: 10m
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "CPU waited for I/O more than 40% of time"
+        description: "The CPU on the {{ $labels.host }} node has spent more than 40% of time waiting for I/O during the last 10 minutes."
+    SystemCpuIoWaitCritical:  # critical tier: iowait above 50 held for the full "for" period
+      if: >-
+        cpu_usage_iowait > 50
+      for: 10m
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "CPU waited for I/O more than 50% of time"
+        description: "The CPU on the {{ $labels.host }} node has spent more than 50% of time waiting for I/O during the last 10 minutes."
+    {%- endraw %}
     {%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %}
     {%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %}
     SystemCpuStealTimeWarning:
@@ -126,6 +148,42 @@
       annotations:
         summary: "Disk {{ $labels.device }} is failing"
         description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
+    SystemDiskBacklogWarning:  # no "for" clause: the condition itself spans a 10m window
+      if: >-  # 2000 is reported below as 2 seconds, i.e. the counter is read as milliseconds
+        increase(diskio_weighted_io_time[10m]) > 2000
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests waited more than 2 seconds"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total more than 2 seconds on the device during the last 10 minutes."
+    SystemDiskBacklogCritical:  # no "for" clause: the condition itself spans a 10m window
+      if: >-  # 5000 is reported below as 5 seconds, i.e. the counter is read as milliseconds
+        increase(diskio_weighted_io_time[10m]) > 5000
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests waited more than 5 seconds"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total more than 5 seconds on the device during the last 10 minutes."
+    SystemDiskRequestQueuedWarning:  # no "for" clause: the condition itself spans a 10m window
+      if: >-  # threshold = 90% of the window: 10 min * 60 s * 1000 ms
+        increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests were queued for more than 90% of time"
+        description: "The {{ $labels.name }} disk on the {{ $labels.host }} node had I/O requests queued for more than 90% of the time during the last 10 minutes."
+    SystemDiskRequestQueuedCritical:  # no "for" clause: the condition itself spans a 10m window
+      if: >-  # threshold = 98% of the window: 10 min * 60 s * 1000 ms
+        increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests were queued for more than 98% of time"
+        description: "The {{ $labels.name }} disk on the {{ $labels.host }} node had I/O requests queued for more than 98% of the time during the last 10 minutes."
     SystemMemoryFullWarning:
       if: >-
         mem_used_percent > 90 and mem_available < 8 * 2^30