# Heka monitoring settings for compute nodes: CPU trigger overrides, system
# alarms tagged with "node_role: compute", and the alarm cluster that
# aggregates those alarms.
parameters:
  heka:
    metric_collector:
      trigger:
        # Override the linux_system_cpu_critical and linux_system_cpu_warning
        # triggers to use specific rules on compute nodes.
        # Both triggers apply the "avg" function to the cpu_wait metric with
        # the same window (120) and differ only by threshold (30 vs 20).
        # NOTE(review): the units of "window" and "threshold" are not visible
        # in this file (presumably seconds and percent) -- confirm against the
        # consumer of these values.
        linux_system_cpu_critical:
          description: 'The CPU usage is too high (compute node)'
          severity: critical
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 30
              window: 120
              periods: 0
              function: avg
        linux_system_cpu_warning:
          description: 'The CPU usage is high (compute node)'
          severity: 'warning'
          # NOTE(review): this is the quoted string 'true', not a YAML
          # boolean, and the critical trigger above has no "enabled" key.
          # Left unchanged; confirm which type the consumer expects.
          enabled: 'true'
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 20
              window: 120
              periods: 0
              function: avg
      alarm:
        # Tag all the system alarm metrics with "node_role: compute". This is
        # to be able to create an alarm cluster for compute nodes.
        linux_system_cpu:
          alerting: enabled
          triggers:
            - linux_system_cpu_critical
            - linux_system_cpu_warning
          dimension:
            node_role: compute
        linux_system_swap:
          alerting: enabled
          triggers:
            - linux_system_swap_usage_critical
            - linux_system_swap_activity_warning
            - linux_system_swap_usage_warning
          dimension:
            node_role: compute
        linux_system_root_fs:
          alerting: enabled
          triggers:
            - linux_system_root_fs_critical
            - linux_system_root_fs_warning
          dimension:
            node_role: compute
        linux_system_network_rx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_rx
            - linux_system_network_warning_dropped_rx
          dimension:
            node_role: compute
        linux_system_network_tx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_tx
            - linux_system_network_warning_dropped_tx
          dimension:
            node_role: compute
        # Unlike the alarms above, HDD errors use "enabled_with_notification".
        linux_system_hdd_errors:
          alerting: enabled_with_notification
          triggers:
            - linux_system_hdd_errors_critical
          dimension:
            node_role: compute
    aggregator:
      alarm_cluster:
        # Cluster of the six system alarms defined under metric_collector,
        # selected by the "node_role: compute" dimension and grouped by
        # hostname.
        compute_nodes:
          policy: majority_of_members
          alerting: enabled_with_notification
          group_by: hostname
          match:
            node_role: compute
          members:
            - linux_system_cpu
            - linux_system_swap
            - linux_system_root_fs
            - linux_system_network_rx
            - linux_system_network_tx
            - linux_system_hdd_errors
          dimension:
            cluster_name: compute
            nagios_host: 01-node-clusters