Rework disk backlog alerts
- change backlog meaning and thresholds:
backlog is now measured as the concurrency level
(how many disk requests are currently
being served or queued)
- filter physical devices only:
device-mapper/LVM volumes (dm-*) and virtual disks (vd*)
misrepresent the physical disk backlog
Change-Id: Idaf8f3f8f6e6938f73ad7ca88f12429475ec2194
Related-PROD: PROD-36296
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 9d29c14..5fe2f05 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -150,25 +150,28 @@
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
SystemDiskBacklogWarning:
if: >-
- increase(diskio_weighted_io_time[10m]) > 2000
+ rate(diskio_weighted_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 10
+ for: 10m
labels:
severity: warning
service: system
annotations:
- summary: "Disk {{ $labels.name }} requests waited 2 seconds"
- description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 2 seconds on the device during the last 10 minutes."
+ summary: "Disk {{ $labels.name }} backlog warning"
+ description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node exceeded concurrency level of 10 during the last 10 minutes."
SystemDiskBacklogCritical:
if: >-
- increase(diskio_weighted_io_time[10m]) > 5000
+ rate(diskio_weighted_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 20
+ for: 10m
labels:
severity: critical
service: system
annotations:
- summary: "Disk {{ $labels.name }} requests waited 5 seconds"
- description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 5 seconds on the device during the last 10 minutes."
+ summary: "Disk {{ $labels.name }} backlog critical"
+ description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node exceeded concurrency level of 20 during the last 10 minutes."
SystemDiskRequestQueuedWarning:
if: >-
- increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000
+ rate(diskio_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 0.9
+ for: 10m
labels:
severity: warning
service: system
@@ -177,7 +180,8 @@
description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 90% of the device time during the last 10 minutes."
SystemDiskRequestQueuedCritical:
if: >-
- increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000
+ rate(diskio_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 0.98
+ for: 10m
labels:
severity: critical
service: system