Ondrej Smola | 03ff34e | 2016-12-01 01:30:33 +0100 | [diff] [blame^] | 1 | parameters: |
| 2 | heka: |
| 3 | metric_collector: |
| 4 | trigger: |
| 5 | # Override the linux_system_cpu_critical and linux_system_cpu_warning |
| 6 | # triggers to use specific rules on control nodes |
| 7 | linux_system_cpu_critical: |
| 8 | description: 'The CPU usage is too high (controller node)' |
| 9 | severity: critical |
| 10 | rules: |
| 11 | - metric: cpu_idle |
| 12 | relational_operator: '<=' |
| 13 | threshold: 5 |
| 14 | window: 120 |
| 15 | periods: 0 |
| 16 | function: avg |
| 17 | - metric: cpu_wait |
| 18 | relational_operator: '>=' |
| 19 | threshold: 35 |
| 20 | window: 120 |
| 21 | periods: 0 |
| 22 | function: avg |
| 23 | linux_system_cpu_warning: |
| 24 | description: 'The CPU usage is high (controller node)' |
| 25 | severity: 'warning' |
| 26 | enabled: 'true' |
| 27 | rules: |
| 28 | - metric: cpu_idle |
| 29 | relational_operator: '<=' |
| 30 | threshold: 15 |
| 31 | window: 120 |
| 32 | periods: 0 |
| 33 | function: avg |
| 34 | - metric: cpu_wait |
| 35 | relational_operator: '>=' |
| 36 | threshold: 25 |
| 37 | window: 120 |
| 38 | periods: 0 |
| 39 | function: avg |
| 40 | alarm: |
| 41 | # Tag all the system alarm metrics with "node_role: control". This is |
| 42 | # needed to be able to create an alarm cluster for control nodes. |
| 43 | linux_system_cpu: |
| 44 | alerting: enabled |
| 45 | triggers: |
| 46 | - linux_system_cpu_critical |
| 47 | - linux_system_cpu_warning |
| 48 | dimension: |
| 49 | node_role: control |
| 50 | linux_system_swap: |
| 51 | alerting: enabled |
| 52 | triggers: |
| 53 | - linux_system_swap_usage_critical |
| 54 | - linux_system_swap_activity_warning |
| 55 | - linux_system_swap_usage_warning |
| 56 | dimension: |
| 57 | node_role: control |
| 58 | linux_system_root_fs: |
| 59 | alerting: enabled |
| 60 | triggers: |
| 61 | - linux_system_root_fs_critical |
| 62 | - linux_system_root_fs_warning |
| 63 | dimension: |
| 64 | node_role: control |
| 65 | linux_system_network_rx: |
| 66 | alerting: enabled |
| 67 | triggers: |
| 68 | - linux_system_network_critical_dropped_rx |
| 69 | - linux_system_network_warning_dropped_rx |
| 70 | dimension: |
| 71 | node_role: control |
| 72 | linux_system_network_tx: |
| 73 | alerting: enabled |
| 74 | triggers: |
| 75 | - linux_system_network_critical_dropped_tx |
| 76 | - linux_system_network_warning_dropped_tx |
| 77 | dimension: |
| 78 | node_role: control |
| 79 | linux_system_hdd_errors: |
| 80 | alerting: enabled_with_notification |
| 81 | triggers: |
| 82 | - linux_system_hdd_errors_critical |
| 83 | dimension: |
| 84 | node_role: control |
| 85 | aggregator: |
| 86 | alarm_cluster: |
| 87 | control_nodes: |
| 88 | policy: majority_of_members |
| 89 | alerting: enabled_with_notification |
| 90 | group_by: hostname |
| 91 | match: |
| 92 | node_role: control |
| 93 | members: |
| 94 | - linux_system_cpu |
| 95 | - linux_system_swap |
| 96 | - linux_system_root_fs |
| 97 | - linux_system_network_rx |
| 98 | - linux_system_network_tx |
| 99 | - linux_system_hdd_errors |
| 100 | dimension: |
| 101 | cluster_name: control |
| 102 | nagios_host: 01-node-clusters |