# Source blob: d4fda7dd5c36b3b05c713c5b1a30608d572d1fab
parameters:
  heka:
    metric_collector:
      trigger:
        # Override the linux_system_cpu_critical and linux_system_cpu_warning
        # triggers to use specific rules on compute nodes. Both rules compare
        # the average (function: avg) of the cpu_wait metric over the window
        # against the threshold: >= 30 is critical, >= 20 is a warning.
        # NOTE(review): the units of window/threshold and the meaning of
        # "periods: 0" are not visible here -- confirm against the consumer.
        linux_system_cpu_critical:
          description: 'The CPU usage is too high (compute node)'
          severity: critical
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 30
              window: 120
              periods: 0
              function: avg
        linux_system_cpu_warning:
          description: 'The CPU usage is high (compute node)'
          severity: warning
          # NOTE(review): this is the string 'true', not a YAML boolean, and
          # the critical trigger above has no "enabled" key at all. Kept
          # unchanged -- confirm which type the consumer expects.
          enabled: 'true'
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 20
              window: 120
              periods: 0
              function: avg
      alarm:
        # Tag all the system alarm metrics with "node_role: compute". This
        # makes it possible to create an alarm cluster for compute nodes.
        linux_system_cpu:
          alerting: enabled
          triggers:
            - linux_system_cpu_critical
            - linux_system_cpu_warning
          dimension:
            node_role: compute
        linux_system_swap:
          alerting: enabled
          triggers:
            - linux_system_swap_usage_critical
            - linux_system_swap_activity_warning
            - linux_system_swap_usage_warning
          dimension:
            node_role: compute
        linux_system_root_fs:
          alerting: enabled
          triggers:
            - linux_system_root_fs_critical
            - linux_system_root_fs_warning
          dimension:
            node_role: compute
        linux_system_network_rx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_rx
            - linux_system_network_warning_dropped_rx
          dimension:
            node_role: compute
        linux_system_network_tx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_tx
            - linux_system_network_warning_dropped_tx
          dimension:
            node_role: compute
        # Unlike the alarms above, HDD errors use enabled_with_notification.
        linux_system_hdd_errors:
          alerting: enabled_with_notification
          triggers:
            - linux_system_hdd_errors_critical
          dimension:
            node_role: compute
    aggregator:
      alarm_cluster:
        # Cluster built from the six system alarms defined under
        # metric_collector.alarm. It matches on the "node_role: compute"
        # dimension those alarms carry and groups by hostname.
        compute_nodes:
          policy: majority_of_members
          alerting: enabled_with_notification
          group_by: hostname
          match:
            node_role: compute
          members:
            - linux_system_cpu
            - linux_system_swap
            - linux_system_root_fs
            - linux_system_network_rx
            - linux_system_network_tx
            - linux_system_hdd_errors
          dimension:
            cluster_name: compute
            nagios_host: 01-node-clusters