Extend procstat_running-based metric alerts with a `for: 2m` clause.
Related-PROD: PROD-35435
Change-Id: I662ae550f13d8d1039eeb6243726707d8a15ccbd
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
index 7f8ef15..5173d38 100644
--- a/opencontrail/meta/prometheus.yml
+++ b/opencontrail/meta/prometheus.yml
@@ -93,6 +93,7 @@
ContrailProcessDown:
if: >-
procstat_running{process_name=~"contrail.*"} == 0
+ for: 2m
labels:
severity: minor
service: contrail
@@ -104,6 +105,7 @@
ContrailProcessDownMinor:
if: >-
count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_warning_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name)
+ for: 2m
labels:
severity: minor
service: contrail
@@ -115,6 +117,7 @@
ContrailProcessDownMajor:
if: >-
count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_critical_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name)
+ for: 2m
labels:
severity: major
service: contrail
@@ -126,6 +129,7 @@
ContrailProcessOutage:
if: >-
count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) == count(procstat_running{process_name=~"contrail.*"}) by (process_name)
+ for: 2m
labels:
severity: critical
service: contrail
@@ -369,6 +373,7 @@
RedisServiceDown:
if: >-
procstat_running{process_name="redis-server"} == 0
+ for: 2m
labels:
severity: minor
service: redis
@@ -380,6 +385,7 @@
if: >-
count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
{%- raw %}
+ for: 2m
labels:
severity: minor
service: redis
@@ -390,6 +396,7 @@
if: >-
count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
{%- raw %}
+ for: 2m
labels:
severity: major
service: redis
@@ -399,6 +406,7 @@
RedisServiceOutage:
if: >-
count(procstat_running{process_name="redis-server"} == 0) == count(procstat_running{process_name="redis-server"})
+ for: 2m
labels:
severity: critical
service: redis
@@ -412,6 +420,7 @@
CassandraServiceDown:
if: >-
procstat_running{process_name="cassandra-server"} == 0
+ for: 2m
labels:
severity: minor
service: cassandra
@@ -423,6 +432,7 @@
if: >-
count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
{%- raw %}
+ for: 2m
labels:
severity: minor
service: cassandra
@@ -433,6 +443,7 @@
if: >-
count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
{%- raw %}
+ for: 2m
labels:
severity: major
service: cassandra
@@ -442,6 +453,7 @@
CassandraServiceOutage:
if: >-
count(procstat_running{process_name="cassandra-server"} == 0) == count(procstat_running{process_name="cassandra-server"})
+ for: 2m
labels:
severity: critical
service: cassandra
@@ -451,6 +463,7 @@
KafkaServiceDown:
if: >-
procstat_running{process_name="kafka-server"} == 0
+ for: 2m
labels:
severity: minor
service: kafka
@@ -461,6 +474,7 @@
KafkaServiceDownMinor:
if: >-
count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+ for: 2m
labels:
severity: minor
service: kafka
@@ -473,6 +487,7 @@
if: >-
count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
{%- raw %}
+ for: 2m
labels:
severity: major
service: kafka
@@ -482,6 +497,7 @@
KafkaServiceOutage:
if: >-
count(procstat_running{process_name="kafka-server"} == 0) == count(procstat_running{process_name="kafka-server"})
+ for: 2m
labels:
severity: critical
service: kafka