Add alarm and alarm clusters
diff --git a/rabbitmq/meta/heka.yml b/rabbitmq/meta/heka.yml
index b3855be..73515f8 100644
--- a/rabbitmq/meta/heka.yml
+++ b/rabbitmq/meta/heka.yml
@@ -75,6 +75,16 @@
window: 120
periods: 0
function: avg
+ rabbitmq_check:
+ description: 'RabbitMQ cannot be checked'
+ severity: down
+ rules:
+ - metric: rabbitmq_check
+ relational_operator: '=='
+ threshold: 0
+ window: 60
+ periods: 0
+ function: last
alarm:
rabbitmq_server_disk:
alerting: enabled
@@ -96,6 +106,10 @@
- rabbitmq_queue_warning
dimension:
service: rabbitmq-cluster
+ rabbitmq_check:
+ alerting: enabled
+ triggers:
+ - rabbitmq_check
aggregator:
alarm_cluster:
rabbitmq_cluster:
@@ -109,3 +123,24 @@
- rabbitmq_server_queue
dimension:
service: rabbitmq
+ rabbitmq_service:
+ # A check failure on a single node doesn't mean that the whole cluster
+ # is down. This is why the 'hostname' group_by and the
+ # 'majority_of_members' policy are used here.
+ policy: majority_of_members
+ group_by: hostname
+ match:
+ member: rabbitmq_check
+ members:
+ - rabbitmq_check
+ dimension:
+ service: rabbitmq
+ rabbitmq:
+ policy: highest_severity
+ match:
+ service: rabbitmq
+ members:
+ - rabbitmq_cluster
+ - rabbitmq_service
+ dimension:
+ cluster_name: rabbitmq