Rework disk backlog alerts

- Change backlog meaning and thresholds: backlog is now treated as a concurrency level (how many disk requests are currently being served or are queued).
- Filter physical devices only: LVMs (dm-) and physical volumes (vd-) misrepresent the physical disk backlog.

Change-Id: Idaf8f3f8f6e6938f73ad7ca88f12429475ec2194
Related-PROD: PROD-36296

commit: eee7c774df7644b908c0e5ef4a36c954265b9833 [log] [tgz]
author: Michal Kobus <mkobus@mirantis.com> Wed Apr 14 15:49:25 2021 +0200
committer: Michal Kobus <mkobus@mirantis.com> Thu Apr 29 19:42:03 2021 +0200
tree: 0b116c8b94128bf9482cfb2fc13320d5dc5d7876
parent: 6d024b15ff32caf03681f7fa69fd8b80af3fdf05 [diff]
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 9d29c14..5fe2f05 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml

@@ -150,25 +150,28 @@
         description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
     SystemDiskBacklogWarning:
       if: >-
-        increase(diskio_weighted_io_time[10m]) > 2000
+        rate(diskio_weighted_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 10
+      for: 10m
       labels:
         severity: warning
         service: system
       annotations:
-        summary: "Disk {{ $labels.name }} requests waited 2 seconds"
-        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 2 seconds on the device during the last 10 minutes."
+        summary: "Disk {{ $labels.name }} backlog warning"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node exceeded concurrency level of 10 during the last 10 minutes."
     SystemDiskBacklogCritical:
       if: >-
-        increase(diskio_weighted_io_time[10m]) > 5000
+        rate(diskio_weighted_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 20
+      for: 10m
       labels:
         severity: critical
         service: system
       annotations:
-        summary: "Disk {{ $labels.name }} requests waited 5 seconds"
-        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 5 seconds on the device during the last 10 minutes."
+        summary: "Disk {{ $labels.name }} backlog critical"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node exceeded concurrency level of 20 during the last 10 minutes."
     SystemDiskRequestQueuedWarning:
       if: >-
-        increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000
+        rate(diskio_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 0.9
+      for: 10m
       labels:
         severity: warning
         service: system
@@ -177,7 +180,8 @@
         description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 90% of the device time during the last 10 minutes."
     SystemDiskRequestQueuedCritical:
       if: >-
-        increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000
+        rate(diskio_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 0.98
+      for: 10m
       labels:
         severity: critical
         service: system
commit	eee7c774df7644b908c0e5ef4a36c954265b9833	[log] [tgz]
author	Michal Kobus <mkobus@mirantis.com>	Wed Apr 14 15:49:25 2021 +0200
committer	Michal Kobus <mkobus@mirantis.com>	Thu Apr 29 19:42:03 2021 +0200
tree	0b116c8b94128bf9482cfb2fc13320d5dc5d7876
parent	6d024b15ff32caf03681f7fa69fd8b80af3fdf05 [diff]