Rework disk backlog alerts

- change backlog meaning and thresholds:
  backlog is now measured as a concurrency level
  (how many disk requests are currently
   being served or queued)
- filter physical devices only:
  LVMs (dm-) and physical volumes (vd-)
  misrepresent the physical disk backlog

Change-Id: Idaf8f3f8f6e6938f73ad7ca88f12429475ec2194
Related-PROD: PROD-36296
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 9d29c14..5fe2f05 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -150,25 +150,28 @@
         description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
     SystemDiskBacklogWarning:
       if: >-
-        increase(diskio_weighted_io_time[10m]) > 2000
+        rate(diskio_weighted_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 10
+      for: 10m
       labels:
         severity: warning
         service: system
       annotations:
-        summary: "Disk {{ $labels.name }} requests waited 2 seconds"
-        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 2 seconds on the device during the last 10 minutes."
+        summary: "Disk {{ $labels.name }} backlog warning"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node exceeded concurrency level of 10 during the last 10 minutes."
     SystemDiskBacklogCritical:
       if: >-
-        increase(diskio_weighted_io_time[10m]) > 5000
+        rate(diskio_weighted_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 20
+      for: 10m
       labels:
         severity: critical
         service: system
       annotations:
-        summary: "Disk {{ $labels.name }} requests waited 5 seconds"
-        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 5 seconds on the device during the last 10 minutes."
+        summary: "Disk {{ $labels.name }} backlog critical"
+        description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node exceeded concurrency level of 20 during the last 10 minutes."
     SystemDiskRequestQueuedWarning:
       if: >-
-        increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000
+        rate(diskio_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 0.9
+      for: 10m
       labels:
         severity: warning
         service: system
@@ -177,7 +180,8 @@
         description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 90% of the device time during the last 10 minutes."
     SystemDiskRequestQueuedCritical:
       if: >-
-        increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000
+        rate(diskio_io_time{name=~"(hd[a-z]?|sd[a-z]?|nvme[0-9]?[a-z]?[0-9]?)"}[1m]) / 1000 > 0.98
+      for: 10m
       labels:
         severity: critical
         service: system