---
# Heka alarm configuration for control nodes.
#
# Recovered from a repository web view of blob
# 7dcb3319f64eab18417abec219fb07f04cad37bd; the viewer's line numbers and
# blame residue have been stripped.
# NOTE(review): the original indentation was lost, so the nesting below was
# reconstructed from key semantics. In particular, confirm that `aggregator`
# is a sibling of `metric_collector` under `heka`.
parameters:
  heka:
    metric_collector:
      trigger:
        # Override the linux_system_cpu_critical and linux_system_cpu_warning
        # triggers to use specific rules on control nodes.
        linux_system_cpu_critical:
          description: 'The CPU usage is too high (controller node)'
          severity: critical
          rules:
            # avg(cpu_idle) <= 5 over a window of 120
            - metric: cpu_idle
              relational_operator: '<='
              threshold: 5
              window: 120
              periods: 0
              function: avg
            # avg(cpu_wait) >= 35 over a window of 120
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 35
              window: 120
              periods: 0
              function: avg
        linux_system_cpu_warning:
          description: 'The CPU usage is high (controller node)'
          severity: warning
          # NOTE(review): quoted, so this is the string 'true' rather than a
          # boolean; confirm the consumer expects a string before changing it.
          enabled: 'true'
          rules:
            # avg(cpu_idle) <= 15 over a window of 120
            - metric: cpu_idle
              relational_operator: '<='
              threshold: 15
              window: 120
              periods: 0
              function: avg
            # avg(cpu_wait) >= 25 over a window of 120
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 25
              window: 120
              periods: 0
              function: avg
      alarm:
        # Tag all the system alarm metrics with "node_role: control". This is
        # needed to be able to create an alarm cluster for control nodes.
        linux_system_cpu:
          alerting: enabled
          triggers:
            - linux_system_cpu_critical
            - linux_system_cpu_warning
          dimension:
            node_role: control
        linux_system_swap:
          alerting: enabled
          triggers:
            - linux_system_swap_usage_critical
            - linux_system_swap_activity_warning
            - linux_system_swap_usage_warning
          dimension:
            node_role: control
        linux_system_root_fs:
          alerting: enabled
          triggers:
            - linux_system_root_fs_critical
            - linux_system_root_fs_warning
          dimension:
            node_role: control
        linux_system_network_rx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_rx
            - linux_system_network_warning_dropped_rx
          dimension:
            node_role: control
        linux_system_network_tx:
          alerting: enabled
          triggers:
            - linux_system_network_critical_dropped_tx
            - linux_system_network_warning_dropped_tx
          dimension:
            node_role: control
        linux_system_hdd_errors:
          alerting: enabled_with_notification
          triggers:
            - linux_system_hdd_errors_critical
          dimension:
            node_role: control
    aggregator:
      alarm_cluster:
        # Cluster of the alarms tagged "node_role: control" above, grouped by
        # hostname.
        control_nodes:
          policy: majority_of_members
          alerting: enabled_with_notification
          group_by: hostname
          match:
            node_role: control
          members:
            - linux_system_cpu
            - linux_system_swap
            - linux_system_root_fs
            - linux_system_network_rx
            - linux_system_network_tx
            - linux_system_hdd_errors
          dimension:
            cluster_name: control
            nagios_host: 01-node-clusters