Merge pull request #29 from simonpasquier/master
Change policy for the rabbitmq_service cluster
diff --git a/rabbitmq/meta/heka.yml b/rabbitmq/meta/heka.yml
index 6d82aed..4a35b4d 100644
--- a/rabbitmq/meta/heka.yml
+++ b/rabbitmq/meta/heka.yml
@@ -89,15 +89,15 @@
rabbitmq_server_disk:
alerting: enabled
triggers:
- - rabbitmq_disk_limit_warning
- rabbitmq_disk_limit_critical
+ - rabbitmq_disk_limit_warning
dimension:
service: rabbitmq-cluster
rabbitmq_server_memory:
alerting: enabled
triggers:
- - rabbitmq_memory_limit_warning
- rabbitmq_memory_limit_critical
+ - rabbitmq_memory_limit_warning
dimension:
service: rabbitmq-cluster
rabbitmq_server_queue:
@@ -115,6 +115,9 @@
rabbitmq_cluster:
alerting: enabled
policy: highest_severity
+ # A 'hostname' group_by is required because an alarm on a single node has
+ # an impact on the whole cluster.
+ group_by: hostname
match:
service: rabbitmq-cluster
members:
@@ -126,9 +129,9 @@
nagios_host: 01-service-clusters
rabbitmq_service:
# A check failure on a single node doesn't mean that the whole cluster
- # is down, this is why a 'hostname' group_by and 'majority_of_members'
+ # is down; this is why a 'hostname' group_by and 'availability_of_members'
# policy are used here
- policy: majority_of_members
+ policy: availability_of_members
alerting: enabled
group_by: hostname
match: