Enhance RabbitMQ log error alerts
Change-Id: I0d1314832c0e073eed6189fae6dd3fa0d38ccc91
Related-PROD: PROD-27880
Related-PROD: PROD-31241
diff --git a/rabbitmq/meta/prometheus.yml b/rabbitmq/meta/prometheus.yml
index 27a3cde..88cce19 100644
--- a/rabbitmq/meta/prometheus.yml
+++ b/rabbitmq/meta/prometheus.yml
@@ -94,12 +94,21 @@
description: "The RabbitMQ service on the {{ $labels.host }} node has received {{ $value }} messages."
RabbitmqErrorLogsTooHigh:
if: >-
- sum(rate(log_messages{service="rabbitmq",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2
+ sum(rate(log_messages{service="rabbitmq",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.05
labels:
- severity: major
+ severity: critical  # rate-based alert: more than 0.05 matching log messages/s averaged over 5m (about 15 in the window)
service: rabbitmq
annotations:
summary: "Too many errors in RabbitMQ logs"
description: "The rate of errors in RabbitMQ logs is too high on the {{ $labels.host }} node (as measured over the last 5 minutes)."
+ RabbitmqErrorLogsMajor:  # fires when any error/emergency/fatal-level RabbitMQ log message was counted in the last 30 minutes
+ if: >-
+ sum(increase(log_messages{service="rabbitmq",level=~"(?i:(error|emergency|fatal))"}[30m])) without (level) > 0
+ labels:
+ severity: major  # the critical, rate-based case is handled by RabbitmqErrorLogsTooHigh
+ service: rabbitmq
+ annotations:
+ summary: "RabbitMQ logs contain errors"
+ description: "The RabbitMQ logs on the {{ $labels.host }} node contain errors (as measured over the last 30 minutes)."
{%- endraw %}
{%- endif %}