Add log_messages metric and RabbitmqErrorLogsTooHigh alert

Change-Id: I6f428332d43c4233dd378d0ee26d3b34e72c85d6
Closes-Bug: PROD-20267
diff --git a/rabbitmq/meta/fluentd.yml b/rabbitmq/meta/fluentd.yml
index 0b75d10..823921e 100644
--- a/rabbitmq/meta/fluentd.yml
+++ b/rabbitmq/meta/fluentd.yml
@@ -32,6 +32,35 @@
match:
push_to_default:
tag: rabbitmq
+ type: copy
+ store:
+ - type: relabel
+ label: default_output
+ - type: rewrite_tag_filter
+ rule:
+ - name: severity_label
+ regexp: '.'
+ result: metric.rabbitmq_log_messages
+ push_to_metric:
+ tag: metric.rabbitmq_log_messages
type: relabel
- label: default_output
+ label: default_metric
+ default_metric:
+ filter:
+ rabbitmq_logs_per_severity:
+ tag: metric.rabbitmq_log_messages
+ require:
+ - add_general_fields
+ type: prometheus
+ metric:
+ - name: log_messages
+ type: counter
+ desc: Total number of log lines by severity
+ label:
+ - name: service
+ value: rabbitmq
+ - name: level
+ value: ${severity_label}
+ - name: host
+ value: ${Hostname}
{%- endif %}
diff --git a/rabbitmq/meta/prometheus.yml b/rabbitmq/meta/prometheus.yml
index c39d5af..56a12ec 100644
--- a/rabbitmq/meta/prometheus.yml
+++ b/rabbitmq/meta/prometheus.yml
@@ -76,5 +76,15 @@
annotations:
summary: "RabbitMQ has reached the limit of {%- endraw %}{{ msg_threshold }}{%- raw %} messages"
description: "The RabbitMQ service on the {{ $labels.host }} node has received {{ $value }} messages."
+ RabbitmqErrorLogsTooHigh:
+ if: >-
+ sum(rate(log_messages{service="rabbitmq",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2
+ for: 1m
+ labels:
+ severity: warning
+ service: rabbitmq
+ annotations:
+ summary: "Too many errors in RabbitMQ logs"
+ description: "The rate of errors in RabbitMQ logs over the last 5 minutes is too high on node {{ $labels.host }}."
{%- endraw %}
{%- endif %}