# Source blob: 4a35b4d5450aa508acf83f21ff5edb1e812d2d0e
# Log collection pipeline for RabbitMQ: one decoder, one input and one
# splitter, each registered under the name `rabbitmq_log`.
log_collector:
  decoder:
    rabbitmq_log:
      engine: sandbox
      # Lua decoder script and the search path for the modules it requires
      # (entries are separated by ';').
      module_file: /usr/share/lma_collector/decoders/rabbitmq.lua
      module_dir: /usr/share/lma_collector/common;/usr/share/heka/lua_modules
      adjust_timezone: true
  input:
    rabbitmq_log:
      engine: logstreamer
      log_directory: '/var/log/rabbitmq'
      # Matches files named `rabbit@<Node>.log`; the named group `Node`
      # captures the part between `rabbit@` and `.log`.
      file_match: 'rabbit@(?P<Node>.+)\.log$'
      # The literal "rabbitmq." followed by the captured `Node` value.
      differentiator: ['rabbitmq.', 'Node']
      # NOTE(review): these names carry a `_decoder` / `_splitter` suffix that
      # the keys in this file do not have; presumably the consumer appends
      # the suffix when it renders the plugin sections -- confirm.
      decoder: 'rabbitmq_log_decoder'
      splitter: 'rabbitmq_log_splitter'
  splitter:
    rabbitmq_log:
      engine: regex
      # A record boundary is a blank line followed by a header of the form
      # `=...====` (captured by the group).
      delimiter: '\n\n(=[^=]+====)'
      # NOTE(review): with `false` the captured header is expected to be
      # prepended to the next record rather than appended to the previous
      # one -- confirm against the Heka RegexSplitter documentation.
      delimiter_eol: false
# Metric-based alerting for RabbitMQ.
#
# `trigger` entries define threshold rules: each rule applies `function` to
# the samples of `metric` and compares the result with `threshold` using
# `relational_operator`.
# NOTE(review): `window` is presumably expressed in seconds and `periods: 0`
# presumably means "evaluate the current window only" -- confirm against the
# consumer's documentation.
#
# `alarm` entries group triggers and attach the dimensions used by the
# aggregator to match them.
metric_collector:
  trigger:
    rabbitmq_disk_limit_critical:
      description: 'RabbitMQ has reached the free disk threshold. All producers are blocked.'
      severity: 'critical'
      no_data_policy: 'okay'
      rules:
        - metric: rabbitmq_remaining_disk
          relational_operator: '<='
          threshold: 0
          window: 20
          periods: 0
          function: min
    rabbitmq_disk_limit_warning:
      description: 'RabbitMQ is getting close to the free disk threshold.'
      severity: 'warning'
      no_data_policy: 'okay'
      rules:
        - metric: rabbitmq_remaining_disk
          relational_operator: '<='
          threshold: 104857600  # 100 MiB (100 * 1024 * 1024)
          window: 20
          periods: 0
          function: min
    rabbitmq_memory_limit_critical:
      description: 'RabbitMQ has reached the memory threshold. All producers are blocked.'
      severity: 'critical'
      no_data_policy: 'okay'
      rules:
        - metric: rabbitmq_remaining_memory
          relational_operator: '<='
          threshold: 0
          window: 20
          periods: 0
          function: min
    rabbitmq_memory_limit_warning:
      description: 'RabbitMQ is getting close to the memory threshold.'
      severity: 'warning'
      no_data_policy: 'okay'
      rules:
        - metric: rabbitmq_remaining_memory
          relational_operator: '<='
          threshold: 104857600  # 100 MiB (100 * 1024 * 1024)
          window: 20
          periods: 0
          function: min
    rabbitmq_queue_warning:
      description: 'The number of outstanding messages is too high.'
      severity: 'warning'
      no_data_policy: 'okay'
      rules:
        - metric: rabbitmq_messages
          relational_operator: '>='
          threshold: 200
          window: 120
          periods: 0
          function: avg
    # Unlike the triggers above, this one sets no `no_data_policy`.
    rabbitmq_check:
      description: 'RabbitMQ cannot be checked'
      severity: 'down'
      rules:
        - metric: rabbitmq_check
          relational_operator: '=='
          threshold: 0
          window: 60
          periods: 0
          function: last
  alarm:
    rabbitmq_server_disk:
      alerting: enabled
      triggers:
        - rabbitmq_disk_limit_critical
        - rabbitmq_disk_limit_warning
      dimension:
        service: rabbitmq-cluster
    rabbitmq_server_memory:
      alerting: enabled
      triggers:
        - rabbitmq_memory_limit_critical
        - rabbitmq_memory_limit_warning
      dimension:
        service: rabbitmq-cluster
    rabbitmq_server_queue:
      alerting: enabled
      triggers:
        - rabbitmq_queue_warning
      dimension:
        service: rabbitmq-cluster
    # This alarm declares no `dimension`; the aggregator selects it by member
    # name instead (see `match: member: rabbitmq_check` there).
    rabbitmq_check:
      alerting: enabled
      triggers:
        - rabbitmq_check
# Cluster-level aggregation of the RabbitMQ alarms.
#
# `rabbitmq_cluster` and `rabbitmq_service` both publish the dimension
# `service: rabbitmq`, which the top-level `rabbitmq` cluster matches on.
aggregator:
  alarm_cluster:
    rabbitmq_cluster:
      alerting: enabled
      policy: highest_severity
      # A 'hostname' group_by is required because an alarm on a single node
      # has an impact on the whole cluster.
      group_by: hostname
      match:
        service: rabbitmq-cluster
      members:
        - rabbitmq_server_disk
        - rabbitmq_server_memory
        - rabbitmq_server_queue
      dimension:
        service: rabbitmq
        nagios_host: 01-service-clusters
    rabbitmq_service:
      # A check failure on a single node doesn't mean that the whole cluster
      # is down. This is why a 'hostname' group_by and the
      # 'availability_of_members' policy are used here.
      policy: availability_of_members
      alerting: enabled
      group_by: hostname
      match:
        member: rabbitmq_check
      members:
        - rabbitmq_check
      dimension:
        service: rabbitmq
        nagios_host: 01-service-clusters
    # Top-level cluster: the only one with notifications enabled.
    rabbitmq:
      policy: highest_severity
      alerting: enabled_with_notification
      match:
        service: rabbitmq
      members:
        - rabbitmq_cluster
        - rabbitmq_service
      dimension:
        cluster_name: rabbitmq
        nagios_host: 00-top-clusters