Ondrej Smola | 03ff34e | 2016-12-01 01:30:33 +0100 | [diff] [blame] | 1 | parameters: |
| 2 | heka: |
| 3 | metric_collector: |
| 4 | trigger: |
| 5 | # Override the linux_system_cpu_critical and linux_system_cpu_warning |
| 6 | # triggers to use specific rules on compute nodes |
| 7 | linux_system_cpu_critical: |
| 8 | description: 'The CPU usage is too high (compute node)' |
| 9 | severity: critical |
| 10 | rules: |
| 11 | - metric: cpu_wait |
| 12 | relational_operator: '>=' |
| 13 | threshold: 30 |
| 14 | window: 120 |
| 15 | periods: 0 |
| 16 | function: avg |
| 17 | linux_system_cpu_warning: |
| 18 | description: 'The CPU usage is high (compute node)' |
| 19 | severity: 'warning' |
| 20 | enabled: 'true' |
| 21 | rules: |
| 22 | - metric: cpu_wait |
| 23 | relational_operator: '>=' |
| 24 | threshold: 20 |
| 25 | window: 120 |
| 26 | periods: 0 |
| 27 | function: avg |
| 28 | alarm: |
| 29 | # Tag all the system alarm metrics with "node_role: compute". This |
| 30 | # makes it possible to create an alarm cluster for compute nodes. |
| 31 | linux_system_cpu: |
| 32 | alerting: enabled |
| 33 | triggers: |
| 34 | - linux_system_cpu_critical |
| 35 | - linux_system_cpu_warning |
| 36 | dimension: |
| 37 | node_role: compute |
| 38 | linux_system_swap: |
| 39 | alerting: enabled |
| 40 | triggers: |
| 41 | - linux_system_swap_usage_critical |
| 42 | - linux_system_swap_activity_warning |
| 43 | - linux_system_swap_usage_warning |
| 44 | dimension: |
| 45 | node_role: compute |
| 46 | linux_system_root_fs: |
| 47 | alerting: enabled |
| 48 | triggers: |
| 49 | - linux_system_root_fs_critical |
| 50 | - linux_system_root_fs_warning |
| 51 | dimension: |
| 52 | node_role: compute |
| 53 | linux_system_network_rx: |
| 54 | alerting: enabled |
| 55 | triggers: |
| 56 | - linux_system_network_critical_dropped_rx |
| 57 | - linux_system_network_warning_dropped_rx |
| 58 | dimension: |
| 59 | node_role: compute |
| 60 | linux_system_network_tx: |
| 61 | alerting: enabled |
| 62 | triggers: |
| 63 | - linux_system_network_critical_dropped_tx |
| 64 | - linux_system_network_warning_dropped_tx |
| 65 | dimension: |
| 66 | node_role: compute |
| 67 | linux_system_hdd_errors: |
| 68 | alerting: enabled_with_notification |
| 69 | triggers: |
| 70 | - linux_system_hdd_errors_critical |
| 71 | dimension: |
| 72 | node_role: compute |
| 73 | aggregator: |
| 74 | alarm_cluster: |
| 75 | compute_nodes: |
Éric Lemoine | b308691 | 2016-12-14 15:15:39 +0000 | [diff] [blame] | 76 | policy: status_of_members |
Ondrej Smola | 03ff34e | 2016-12-01 01:30:33 +0100 | [diff] [blame] | 77 | alerting: enabled_with_notification |
| 78 | group_by: hostname |
| 79 | match: |
| 80 | node_role: compute |
| 81 | members: |
| 82 | - linux_system_cpu |
| 83 | - linux_system_swap |
| 84 | - linux_system_root_fs |
| 85 | - linux_system_network_rx |
| 86 | - linux_system_network_tx |
| 87 | - linux_system_hdd_errors |
| 88 | dimension: |
| 89 | cluster_name: compute |
| 90 | nagios_host: 01-node-clusters |