Add CPU steal time alerts
- warning for cpu_usage_steal>5 for 5 minutes
- critical for cpu_usage_steal>10 for 5 minutes
Change-Id: I6fc5c7ed369655b88a5da8c9e2821f84cbc5b509
Related-bug: PROD-32803
diff --git a/linux/map.jinja b/linux/map.jinja
index 44cacc1..a969268 100644
--- a/linux/map.jinja
+++ b/linux/map.jinja
@@ -414,6 +414,10 @@
'cpu_usage_percentage': {
'warn': 90.0,
},
+ 'cpu_steal_percentage': {
+ 'warn': 5.0,
+ 'crit': 10.0,
+ },
'memory_usage_percentage': {
'warn': 90.0,
'major': 95.0,
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 03e0cca..2fe7036 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -1,7 +1,32 @@
{%- from "linux/map.jinja" import monitoring, network with context %}
server:
alert:
+ {#- CPU steal thresholds (percentages) read from monitoring.cpu_steal_percentage in linux/map.jinja. #}
+ {%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %}
+ {%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %}
+ # Warning: cpu_usage_steal stays above the warn threshold for 5 minutes.
+ SystemCpuStealTimeWarning:
+ if: >-
+ cpu_usage_steal > {{ cpu_steal_warn }}
+ for: 5m
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: "CPU steal time warning"
+ description: "The CPU steal time was above {{ cpu_steal_warn }}% on the {%- raw %} {{ $labels.host }}{%- endraw %} node for 5 minutes."
+ # Critical: cpu_usage_steal stays above the crit threshold for 5 minutes.
+ SystemCpuStealTimeCritical:
+ if: >-
+ cpu_usage_steal > {{ cpu_steal_crit }}
+ for: 5m
+ labels:
+ severity: critical
+ service: system
+ annotations:
+ summary: "CPU steal time critical"
+ description: "The CPU steal time was above {{ cpu_steal_crit }}% on the {%- raw %} {{ $labels.host }}{%- endraw %} node for 5 minutes."
SystemCpuFullWarning:
{%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
if: >-
100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}