# Source blob: e94db48ad6d44af105d78c7ded9937961e4ace9f
parameters:
  heka:
    metric_collector:
      trigger:
        # Override the linux_system_cpu_critical and linux_system_cpu_warning
        # triggers to use specific rules on control nodes.
        #
        # NOTE(review): each rule names a metric, a comparison operator, a
        # threshold, a window, a period count and an aggregation function.
        # Presumably `window` is in seconds and the CPU thresholds are
        # percentages -- confirm against the consuming formula.
        linux_system_cpu_critical:
          description: 'The CPU usage is too high (controller node)'
          severity: critical
          rules:
            - metric: cpu_idle
              relational_operator: '<='
              threshold: 5
              window: 120
              periods: 0
              function: avg
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 35
              window: 120
              periods: 0
              function: avg
        linux_system_cpu_warning:
          description: 'The CPU usage is high (controller node)'
          severity: warning
          # NOTE(review): this is the quoted string 'true', not a YAML
          # boolean, and the critical trigger above has no `enabled` key.
          # Kept as-is; confirm which type the consumer expects.
          enabled: 'true'
          rules:
            - metric: cpu_idle
              relational_operator: '<='
              threshold: 15
              window: 120
              periods: 0
              function: avg
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 25
              window: 120
              periods: 0
              function: avg

      alarm:
        # Tag all the system alarm metrics with "node_role: control" so that
        # an alarm cluster can be created for the control nodes.
        linux_system_cpu:
          alerting: enabled
          triggers:
            - linux_system_cpu_critical
            - linux_system_cpu_warning
          dimension:
            node_role: control
        linux_system_swap:
          alerting: enabled
          triggers:
            - linux_system_swap_usage_critical
            - linux_system_swap_activity_warning
            - linux_system_swap_usage_warning
          dimension:
            node_role: control
        linux_system_root_fs:
          alerting: enabled
          triggers:
            - linux_system_root_fs_critical
            - linux_system_root_fs_warning
          dimension:
            node_role: control
        linux_system_network_rx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_rx
            - linux_system_network_warning_dropped_rx
          dimension:
            node_role: control
        linux_system_network_tx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_tx
            - linux_system_network_warning_dropped_tx
          dimension:
            node_role: control
        # Unlike the alarms above, this one uses enabled_with_notification.
        linux_system_hdd_errors:
          alerting: enabled_with_notification
          triggers:
            - linux_system_hdd_errors_critical
          dimension:
            node_role: control

    aggregator:
      alarm_cluster:
        # Cluster of the system alarms defined for the control nodes: it
        # matches on the "node_role: control" dimension, groups by hostname
        # and lists the six system alarms as members.
        control_nodes:
          policy: status_of_members
          alerting: enabled_with_notification
          group_by: hostname
          match:
            node_role: control
          members:
            - linux_system_cpu
            - linux_system_swap
            - linux_system_root_fs
            - linux_system_network_rx
            - linux_system_network_tx
            - linux_system_hdd_errors
          dimension:
            cluster_name: control
            nagios_host: 01-node-clusters