Add alert for out-of-memory errors
Change-Id: Id0440ccc3c3c167c870fccec3b2f36cdddf5904c
Depends-On: I3bf4e4e25e2a4aa84f12454fc3f5456e6bafc9a6
diff --git a/rsyslog/map.jinja b/rsyslog/map.jinja
index b5c7b8b..f7d38d4 100644
--- a/rsyslog/map.jinja
+++ b/rsyslog/map.jinja
@@ -157,5 +157,6 @@
{%- set monitoring = salt['grains.filter_by']({
'default': {
'failed_auths_per_sec': 0.2,
+ 'out_of_memory_errors_per_sec': 0.0011,
},
}, grain='os_family', merge=salt['pillar.get']('rsyslog:monitoring')) %}
diff --git a/rsyslog/meta/prometheus.yml b/rsyslog/meta/prometheus.yml
index 5740645..1564e6d 100644
--- a/rsyslog/meta/prometheus.yml
+++ b/rsyslog/meta/prometheus.yml
@@ -16,4 +16,15 @@
annotations:
summary: 'Too many failed SSH logins'
description: 'The rate of failed logins is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ threshold }}).'
+ OutOfMemoryTooHigh:  # warning alert: 5m rate of out_of_memory_total above monitoring.out_of_memory_errors_per_sec
+ {%- set threshold = monitoring.out_of_memory_errors_per_sec|float %}
+ if: >-
+ rate(out_of_memory_total[5m]) > {{ threshold }}
+{%- raw %}
+ labels:
+ severity: warning
+ service: system
+ annotations:  # the label/value placeholders below sit in the raw section and are emitted literally; only the trailing threshold is substituted at render time
+ summary: 'Too many out-of-memory errors'
+ description: 'The rate of out-of-memory errors is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ threshold }}).'
{%- endif %}