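Add a critical alert on low memory

Add a 'crit' threshold (5.0) next to the existing 'warn' threshold
(10.0) for free_memory_percentage and use it for a new alert with
severity 'critical'.

The existing warning alert is renamed from SystemMemoryAvailableTooLow
to SystemMemoryAvailableLow, and the SystemMemoryAvailableTooLow name
is now used by the new critical alert. Anything that matches on the
old alert name will therefore match the critical alert after this
change.

Change-Id: I1c8e752de9ad3479da830706ae736df6846b977f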
diff --git a/linux/map.jinja b/linux/map.jinja
index 703c9c9..29209b1 100644
--- a/linux/map.jinja
+++ b/linux/map.jinja
@@ -217,6 +217,7 @@
},
'free_memory_percentage': {
'warn': 10.0,
+ 'crit': 5.0,
},
'load_5': {
'warn': 3,
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 779bf71..d2b3d05 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -31,16 +31,26 @@
summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
{% endraw %}
- SystemMemoryAvailableTooLow:
- {%- set mem_avail_threshold = monitoring.free_memory_percentage.warn|float %}
- if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_threshold }}
+ SystemMemoryAvailableLow:
+ {%- set mem_avail_warn_threshold = monitoring.free_memory_percentage.warn|float %}
+ if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_warn_threshold }}
{% raw %}
labels:
severity: warning
service: system
annotations:
+ summary: 'Free memory low on {{ $labels.host }}'
+ description: 'The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_warn_threshold }}%).'
+ SystemMemoryAvailableTooLow:
+ {%- set mem_avail_crit_threshold = monitoring.free_memory_percentage.crit|float %}
+ if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_crit_threshold }}
+ {% raw %}
+ labels:
+ severity: critical
+ service: system
+ annotations:
summary: 'Free memory too low on {{ $labels.host }}'
- description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_threshold }}%).'
+ description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_crit_threshold }}%).'
SystemLoad5TooHigh:
if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
{% raw %}