Fix Prometheus alerts on dropped packets. Since metrics on dropped packets are counters, the alerts should use the rate() function. This change also fixes some inconsistencies in the alert descriptions. Change-Id: I9abbc0a49f45ba760836c436a3e7e65aa62f652e

commit: db768fb47c57f9fdad9b6f254716ab5fb35329ed [log] [tgz]
author: Simon Pasquier <spasquier@mirantis.com> Tue Jul 25 10:12:42 2017 +0200
committer: Simon Pasquier <spasquier@mirantis.com> Tue Jul 25 10:12:42 2017 +0200
tree: 69257f223e097f7fd90ebce0a716cf0940bc7f33
parent: c7b79ad6b4b6d38d23046b9dff3f5ddc302c94b4 [diff]
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 49b7965..f6f91c2 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml

@@ -10,7 +10,7 @@
         service: system
       annotations:
         summary: 'Idle CPU usage too low on {{ $labels.host }}'
-        description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ cpu_idle_threshold}})'
+        description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ cpu_idle_threshold}}%).'
     SystemDiskSpaceTooLow:
       if: 'predict_linear(disk_free[1h], 8*3600) < 0'
       {% raw %}
@@ -19,7 +19,7 @@
         service: system
       annotations:
         summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
-        description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}'
+        description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
       {% endraw %}
     SystemDiskInodesTooLow:
       if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
@@ -29,7 +29,7 @@
         service: system
       annotations:
         summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
-        description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}'
+        description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
       {% endraw %}
     SystemMemoryAvailableTooLow:
       {%- set mem_avail_threshold = monitoring.free_memory_percentage.warn|float %}
@@ -40,7 +40,7 @@
         service: system
       annotations:
         summary: 'Free memory too low on {{ $labels.host }}'
-        description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ mem_avail_threshold }})'
+        description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_threshold }}%).'
     SystemLoad5TooHigh:
       if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
       {% raw %}
@@ -49,28 +49,27 @@
         service: system
       annotations:
         summary: 'High system load (5m) on {{ $labels.host }}'
-        description: 'High system load (5m) on node {{ $labels.host }}'
-      {% endraw %}
+        description: 'The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ monitoring.load_5.warn }}).'
     SystemRxPacketsDroppedTooHigh:
       {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
-      if: avg_over_time(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
+      if: rate(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
       {% raw %}
       labels:
         severity: warning
         service: system
       annotations:
         summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
-        description: 'The average number of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $label.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_rx_dropped_threshold }})'
+        description: 'The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $label.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_rx_dropped_threshold }}/sec)'
     SystemTxPacketsDroppedTooHigh:
       {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
-      if: avg_over_time(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
+      if: rate(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
       {% raw %}
       labels:
         severity: warning
         service: system
       annotations:
         summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
-        description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $label.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})'
+        description: 'The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $label.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_tx_dropped_threshold }}/sec)'
     SystemSwapUsed:
       {%- set swap_used_threshold = monitoring.swap.warn.strip('%')|float %}
       if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }}
@@ -80,7 +79,7 @@
         service: system
       annotations:
         summary: 'Swap usage too high on {{ $labels.host }}'
-        description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }})'
+        description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }}%)'
     SystemSwapIn:
       {%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
       if: rate(swap_in[2m]) > {{ swap_in_threshold }}
@@ -90,7 +89,7 @@
         service: system
       annotations:
         summary: 'Swap input throughput too high on {{ $labels.host }}'
-        description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }})'
+        description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }}b/s).'
     SystemSwapOut:
       {%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
       if: rate(swap_out[2m]) > {{ swap_out_threshold }}
@@ -100,4 +99,4 @@
         service: system
       annotations:
         summary: 'Swap output throughput too high on {{ $labels.host }}'
-        description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }})'
+        description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }}b/s).'
commit	db768fb47c57f9fdad9b6f254716ab5fb35329ed	[log] [tgz]
author	Simon Pasquier <spasquier@mirantis.com>	Tue Jul 25 10:12:42 2017 +0200
committer	Simon Pasquier <spasquier@mirantis.com>	Tue Jul 25 10:12:42 2017 +0200
tree	69257f223e097f7fd90ebce0a716cf0940bc7f33
parent	c7b79ad6b4b6d38d23046b9dff3f5ddc302c94b4 [diff]