Updated PacketsDroppedByCpu* alerts
- changed their severity: minor -> warning, major -> minor
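- changed the evaluation window from 24 hours to 10 minutes
- hard-coded the thresholds (0 and 100) and removed the
  packets_dropped_per_cpu_threshold default from linux/map.jinja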
Change-Id: I8c2cf1cd8a9fd985c8e4d77004179a92dbb7d5fd
Closes-Bug: PROD-27298
diff --git a/linux/map.jinja b/linux/map.jinja
index b24e9ce..a485c76 100644
--- a/linux/map.jinja
+++ b/linux/map.jinja
@@ -448,10 +448,6 @@
'failed_auths_threshold': {
'warn': 5,
},
- 'netdev_budget_squeeze_rate': 0.1,
- 'packets_dropped_per_cpu_threshold': {
- 'minor': '0',
- 'major': '100'
- }
+ 'netdev_budget_squeeze_rate': 0.1
},
}, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index f405367..753a587 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -198,28 +198,24 @@
annotations:
summary: "{{ threshold }}{%- raw %} failed SSH logins"
description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
- PacketsDroppedByCpuMinor:
- {%- endraw %}
- {%- set packets_dropped_minor_threshold = monitoring.packets_dropped_per_cpu_threshold.minor %}
+ PacketsDroppedByCpuWarning:
if: >-
- floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_minor_threshold }}
+ floor(increase(nstat_packet_drop[10m])) > 0
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: "Increased number of CPU dropped packets"
+ description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes."
+ PacketsDroppedByCpuMinor:
+ if: >-
+ floor(increase(nstat_packet_drop[10m])) > 100
labels:
severity: minor
service: system
annotations:
- summary: "CPU dropped {{ packets_dropped_minor_threshold }}{%- raw %} packets"
- description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
- PacketsDroppedByCpuMajor:
- {%- endraw %}
- {%- set packets_dropped_major_threshold = monitoring.packets_dropped_per_cpu_threshold.major %}
- if: >-
- floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_major_threshold }}
- labels:
- severity: major
- service: system
- annotations:
- summary: "CPU dropped {{ packets_dropped_major_threshold }}{%- raw %} packets"
- description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
+ summary: "CPU dropped more than 100 packets"
+ description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes."
NetdevBudgetRanOutsWarning:
{%- endraw %}
{%- set squeeze_rate_threshold = monitoring.netdev_budget_squeeze_rate %}