Fix Zookeeper alert
This change replaces the alert on the procstat metric with an alert on
the zookeeper_up metric, which covers more error cases (e.g. the process
is alive but Zookeeper is stuck).
Change-Id: I0e38b47e0391951b42b999942a4a7423088cf5a5
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
index c53f9c4..6f86598 100644
--- a/opencontrail/meta/prometheus.yml
+++ b/opencontrail/meta/prometheus.yml
@@ -31,7 +31,7 @@
{%- if database.get('enabled', False) %}
{%- set database_processes = (
- 'zookeeper-server', 'kafka-server', 'cassandra-server',
+ 'kafka-server', 'cassandra-server',
'contrail-nodemgr-database', 'contrail-supervisord-database',
) %}
{%- endif %}
@@ -495,6 +495,20 @@
description: '{{ contrail_process }} service is down on node {% raw %}{{ $labels.host }}{% endraw %}'
{%- endfor %}
{%- endif %}
+
+ {%- if database.get('enabled', False) %}
+ ZookeeperDown:  # fires once zookeeper_up has been different from 1 for 2m
+ if: >-
+ zookeeper_up != 1
+ for: 2m
+ labels:
+ severity: warning
+ service: zookeeper
+ annotations:  # $labels.* is a Prometheus placeholder; it is kept inside a raw block so Jinja does not parse it
+ summary: 'Zookeeper service down'
+ description: 'Zookeeper service is down on node {% raw %}{{ $labels.host }}{% endraw %}.'
+ {%- endif %}
+
{%- if exporters is defined %}
{%- include "prometheus/_exporters_config.sls" %}
{%- endif %}