# Heka metric collector and aggregator settings for compute nodes.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost; confirm the hierarchy (in particular that
# `aggregator` is a sibling of `metric_collector` under `heka`) against the
# consuming formula's pillar schema.
parameters:
  heka:
    metric_collector:
      trigger:
        # Override the linux_system_cpu_critical and linux_system_cpu_warning
        # triggers to use specific rules on compute nodes.
        linux_system_cpu_critical:
          description: 'The CPU usage is too high (compute node)'
          severity: critical
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 30
              window: 120
              periods: 0
              function: avg
        linux_system_cpu_warning:
          description: 'The CPU usage is high (compute node)'
          severity: 'warning'
          # NOTE(review): this is the quoted string 'true', not a YAML
          # boolean. Kept as-is because the consumer may expect a string;
          # confirm before changing the type.
          enabled: 'true'
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 20
              window: 120
              periods: 0
              function: avg
      alarm:
        # Tag all the system alarm metrics with "node_role: compute". This
        # makes it possible to create an alarm cluster for compute nodes.
        linux_system_cpu:
          alerting: enabled
          triggers:
            - linux_system_cpu_critical
            - linux_system_cpu_warning
          dimension:
            node_role: compute
        linux_system_swap:
          alerting: enabled
          triggers:
            - linux_system_swap_usage_critical
            - linux_system_swap_activity_warning
            - linux_system_swap_usage_warning
          dimension:
            node_role: compute
        linux_system_root_fs:
          alerting: enabled
          triggers:
            - linux_system_root_fs_critical
            - linux_system_root_fs_warning
          dimension:
            node_role: compute
        linux_system_network_rx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_rx
            - linux_system_network_warning_dropped_rx
          dimension:
            node_role: compute
        linux_system_network_tx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_tx
            - linux_system_network_warning_dropped_tx
          dimension:
            node_role: compute
        linux_system_hdd_errors:
          alerting: enabled_with_notification
          triggers:
            - linux_system_hdd_errors_critical
          dimension:
            node_role: compute
    aggregator:
      alarm_cluster:
        # Cluster built from the system alarms listed under `members`,
        # restricted to those tagged "node_role: compute" by the alarm
        # definitions above.
        compute_nodes:
          policy: status_of_members
          alerting: enabled_with_notification
          group_by: hostname
          match:
            node_role: compute
          members:
            - linux_system_cpu
            - linux_system_swap
            - linux_system_root_fs
            - linux_system_network_rx
            - linux_system_network_tx
            - linux_system_hdd_errors
          dimension:
            cluster_name: compute
            nagios_host: 01-node-clusters
90 nagios_host: 01-node-clusters