Add thresholds to Kibana alarms
Change-Id: Iaf8909163b933b9019d8cf491a596271a0b78827
Partial-Bug: PROD-15203
diff --git a/kibana/map.jinja b/kibana/map.jinja
index 51e1331..93852ec 100644
--- a/kibana/map.jinja
+++ b/kibana/map.jinja
@@ -8,6 +8,13 @@
},
}, merge=salt['pillar.get']('kibana:server')) %}
+{% set monitoring = salt['grains.filter_by']({
+ 'default': {
+ 'service_failed_warning_threshold_percent': 0.3,
+ 'service_failed_critical_threshold_percent': 0.6,
+ },
+}, grain='os_family', merge=salt['pillar.get']('kibana:monitoring')) %}{# Despite the "_percent" suffix these are ratios (0.3 = 30%): the alert rules multiply the process count by them directly and scale by 100 only for display. Override via the kibana:monitoring pillar. #}
+
{%- load_yaml as client_defaults %}
default:
server:
diff --git a/kibana/meta/prometheus.yml b/kibana/meta/prometheus.yml
index 8b794be..5fa6af9 100644
--- a/kibana/meta/prometheus.yml
+++ b/kibana/meta/prometheus.yml
@@ -1,17 +1,49 @@
-{%- from "kibana/map.jinja" import server with context %}
+{%- from "kibana/map.jinja" import server, monitoring with context %}
{%- if server.get('enabled', False) %}
server:
alert:
- KibanaProcessDown:
+ KibanaProcessInfo:  # un-aggregated rule: one info alert per matching series (labelled by host) whose kibana process is not running
if: >-
procstat_running{process_name="kibana"} == 0
{% raw %}
labels:
- severity: warning
+ severity: info
service: kibana
annotations:
summary: 'Kibana service is down'
description: 'Kibana service is down on node {{ $labels.host }}'
{% endraw %}
+    KibanaProcessWarning:  # cluster-wide: fires once stopped/total reaches the warning ratio (inclusive, >=)
+      if: >-
+        count(procstat_running{process_name="kibana"} == 0) >= count(procstat_running{process_name="kibana"}) * {{ monitoring.service_failed_warning_threshold_percent }}
+      {% raw %}
+      labels:
+        severity: warning
+        service: kibana
+      annotations:
+        summary: 'At least {%- endraw %} {{ (monitoring.service_failed_warning_threshold_percent * 100) | round(2) }}%{%- raw %} of Kibana services are down'
+        description: 'At least {%- endraw %} {{ (monitoring.service_failed_warning_threshold_percent * 100) | round(2) }}%{%- raw %} of Kibana services are down'
+      {% endraw %}
+    KibanaProcessCritical:  # cluster-wide: fires once stopped/total reaches the critical ratio (inclusive, >=)
+      if: >-
+        count(procstat_running{process_name="kibana"} == 0) >= count(procstat_running{process_name="kibana"}) * {{ monitoring.service_failed_critical_threshold_percent }}
+      {% raw %}
+      labels:
+        severity: critical
+        service: kibana
+      annotations:
+        summary: 'At least {%- endraw %} {{ (monitoring.service_failed_critical_threshold_percent * 100) | round(2) }}%{%- raw %} of Kibana services are down'
+        description: 'At least {%- endraw %} {{ (monitoring.service_failed_critical_threshold_percent * 100) | round(2) }}%{%- raw %} of Kibana services are down'
+      {% endraw %}
+    KibanaProcessDown:  # fires only when every reporting kibana process is stopped (stopped count equals total count)
+      if: >-
+        count(procstat_running{process_name="kibana"} == 0) == count(procstat_running{process_name="kibana"})
+      {% raw %}
+      labels:
+        service: kibana
+        severity: down
+      annotations:
+        description: 'All Kibana services are down'
+        summary: 'All Kibana services are down'
+      {% endraw %}
{%- endif %}
-