# Heka alarm definitions for control nodes: two CPU trigger overrides, the
# system alarms tagged with "node_role: control", and the alarm cluster that
# aggregates those alarms per host.
parameters:
  heka:
    metric_collector:
      trigger:
        # Override the linux_system_cpu_critical and linux_system_cpu_warning
        # triggers to use specific rules on control nodes.
        # NOTE(review): thresholds are presumably percentages and windows
        # presumably seconds -- confirm against the consumer of these rules.
        linux_system_cpu_critical:
          description: 'The CPU usage is too high (controller node)'
          severity: critical
          rules:
            - metric: cpu_idle
              relational_operator: '<='
              threshold: 5
              window: 120
              periods: 0
              function: avg
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 35
              window: 120
              periods: 0
              function: avg
        linux_system_cpu_warning:
          description: 'The CPU usage is high (controller node)'
          severity: warning
          # NOTE(review): this is the string 'true', not a YAML boolean, and
          # the critical trigger above has no "enabled" key at all -- confirm
          # which form the consumer expects before normalising.
          enabled: 'true'
          rules:
            - metric: cpu_idle
              relational_operator: '<='
              threshold: 15
              window: 120
              periods: 0
              function: avg
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 25
              window: 120
              periods: 0
              function: avg
      alarm:
        # Tag all the system alarm metrics with "node_role: control" so that
        # an alarm cluster can be created for the control nodes (see the
        # "match" key of the control_nodes alarm cluster).
        linux_system_cpu:
          alerting: enabled
          triggers:
            - linux_system_cpu_critical
            - linux_system_cpu_warning
          dimension:
            node_role: control
        linux_system_swap:
          alerting: enabled
          triggers:
            - linux_system_swap_usage_critical
            - linux_system_swap_activity_warning
            - linux_system_swap_usage_warning
          dimension:
            node_role: control
        linux_system_root_fs:
          alerting: enabled
          triggers:
            - linux_system_root_fs_critical
            - linux_system_root_fs_warning
          dimension:
            node_role: control
        linux_system_network_rx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_rx
            - linux_system_network_warning_dropped_rx
          dimension:
            node_role: control
        linux_system_network_tx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_tx
            - linux_system_network_warning_dropped_tx
          dimension:
            node_role: control
        # Unlike the alarms above, this one uses enabled_with_notification.
        linux_system_hdd_errors:
          alerting: enabled_with_notification
          triggers:
            - linux_system_hdd_errors_critical
          dimension:
            node_role: control
    # NOTE(review): "aggregator" is placed as a sibling of "metric_collector"
    # under "heka" -- confirm against the heka formula's pillar layout.
    aggregator:
      alarm_cluster:
        # Groups, per hostname, the alarms tagged "node_role: control" above.
        # The member list mirrors the alarm names defined under
        # metric_collector.alarm; keep the two in sync.
        control_nodes:
          policy: status_of_members
          alerting: enabled_with_notification
          group_by: hostname
          match:
            node_role: control
          members:
            - linux_system_cpu
            - linux_system_swap
            - linux_system_root_fs
            - linux_system_network_rx
            - linux_system_network_tx
            - linux_system_hdd_errors
          dimension:
            cluster_name: control
            nagios_host: 01-node-clusters