Merge "Add I/O wait-related alerts"

commit: ac85c4fe780deef44efe89fd62103b70915553d8 [log] [tgz]
author: mcp-jenkins <mcp-jenkins@mirantis.com> Fri Apr 02 12:50:00 2021 +0000
committer: Gerrit Code Review <gerrit@cffb16fd602b> Fri Apr 02 12:50:00 2021 +0000
tree: 5317ab9195a95cfa5a862439c534513075e0ba97
parent: 559bbc087a36dbf9f456acb8777b28c06c589de9 [diff]
parent: 608c6018e24b9c066cb6ffb133e5a656fa868abd [diff]
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index a387b16..fac53c2 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml

@@ -1,6 +1,28 @@
 {%- from "linux/map.jinja" import monitoring, network with context %}
 server:
   alert:
+    {%- raw %}
+    SystemCpuIoWaitWarning:  # warning tier of the I/O wait alert pair
+      if: >-
+        cpu_usage_iowait > 40
+      for: 10m  # the expression must hold for this long before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "CPU waited for I/O 40% of time"
+        description: "The CPU on the {{ $labels.host }} node has been spending more than 40% of time waiting for I/O for the last 10 minutes."
+    SystemCpuIoWaitCritical:  # critical tier of the I/O wait alert pair
+      if: >-
+        cpu_usage_iowait > 50
+      for: 10m  # the expression must hold for this long before the alert fires
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "CPU waited for I/O 50% of time"
+        description: "The CPU on the {{ $labels.host }} node has been spending more than 50% of time waiting for I/O for the last 10 minutes."
+    {%- endraw %}
     {%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %}
     {%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %}
     SystemCpuStealTimeWarning:
@@ -126,6 +148,42 @@
       annotations:
         summary: "Disk {{ $labels.device }} is failing"
         description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
+    SystemDiskBacklogWarning:  # threshold presumably in ms (2000 matches "2 seconds") - confirm
+      if: >-
+        increase(diskio_weighted_io_time[10m]) > 2000
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests waited 2 seconds"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total more than 2 seconds on the device during the last 10 minutes."
+    SystemDiskBacklogCritical:  # threshold presumably in ms (5000 matches "5 seconds") - confirm
+      if: >-
+        increase(diskio_weighted_io_time[10m]) > 5000
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests waited 5 seconds"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total more than 5 seconds on the device during the last 10 minutes."
+    SystemDiskRequestQueuedWarning:  # 0.9 * 10 * 60 * 1000 = 90% of the 10m window, presumably in ms
+      if: >-
+        increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests were queued for 90% of time"
+        description: "The {{ $labels.name }} disk on the {{ $labels.host }} node had I/O requests queued for more than 90% of the time during the last 10 minutes."
+    SystemDiskRequestQueuedCritical:  # 0.98 * 10 * 60 * 1000 = 98% of the 10m window, presumably in ms
+      if: >-
+        increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "Disk {{ $labels.name }} requests were queued for 98% of time"
+        description: "The {{ $labels.name }} disk on the {{ $labels.host }} node had I/O requests queued for more than 98% of the time during the last 10 minutes."
     SystemMemoryFullWarning:
       if: >-
         mem_used_percent > 90 and mem_available < 8 * 2^30
commit	ac85c4fe780deef44efe89fd62103b70915553d8	[log] [tgz]
author	mcp-jenkins <mcp-jenkins@mirantis.com>	Fri Apr 02 12:50:00 2021 +0000
committer	Gerrit Code Review <gerrit@cffb16fd602b>	Fri Apr 02 12:50:00 2021 +0000
tree	5317ab9195a95cfa5a862439c534513075e0ba97
parent	559bbc087a36dbf9f456acb8777b28c06c589de9 [diff]
parent	608c6018e24b9c066cb6ffb133e5a656fa868abd [diff]