blob: 378faa3ec20469a61b8c2f66ba5d6e3ca31d0291 [file] [log] [blame]
Jakub Pavlike7d12cd2015-09-03 19:02:45 +02001
2============
3Heka Formula
4============
5
Ales Komarekc9a3eb12016-10-12 11:17:55 +02006Heka is an open source stream processing software system developed by Mozilla. Heka is a Swiss Army Knife type tool for data processing.
Jakub Pavlike7d12cd2015-09-03 19:02:45 +02007
8Sample pillars
9==============
10
Ales Komarekc9a3eb12016-10-12 11:17:55 +020011Metric collector service
Ales Komarekf8d248e2016-10-21 10:27:28 +020012------------------------
13
Ales Komareke2b62602016-10-21 13:24:10 +020014Local alarm definition for nova compute role, excerpt from `nova/meta/heka.yml`.
jan kaufman1002cd92015-09-16 16:30:48 +020015
16.. code-block:: yaml
17
jan kaufman1002cd92015-09-16 16:30:48 +020018 heka:
Ales Komarekc9a3eb12016-10-12 11:17:55 +020019 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +020020 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +020021 nova_compute_filesystem_warning:
Ales Komareke2b62602016-10-21 13:24:10 +020022 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +020023 enabled: True # implicit
24 description: "The nova instance filesystem's root free space is low."
25 severity: warning
26 logical_operator: or # implicit
27 rules:
28 - metric: fs_space_percent_free
29 relational_operator: '<'
30 threshold: 10
31 window: 60
32 periods: 0
33 function: min
34 fs: '/var/lib/nova'
35 nova_compute_filesystem_critical:
Ales Komareke2b62602016-10-21 13:24:10 +020036 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +020037 enabled: True # implicit
38 description: "The nova instance filesystem's root free space is low."
39 severity: critical
40 logical_operator: or # implicit
41 rules:
42 - metric: fs_space_percent_free
43 relational_operator: '<'
44 threshold: 5
45 window: 60
46 periods: 0
47 function: min
48 fs: '/var/lib/nova'
Ales Komareke2b62602016-10-21 13:24:10 +020049 filter:
Ales Komarekf8d248e2016-10-21 10:27:28 +020050 nova_compute_service:
Ales Komareke2b62602016-10-21 13:24:10 +020051 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +020052 notifications: False
53 alerting: True
54 trigger:
55 vip:
56 - nova_compute_filesystem_warning
57 - nova_compute_filesystem_critical
Ales Komareke2b62602016-10-21 13:24:10 +020059 aggregator:
60 filter:
61 nova_compute: # the service_role format
62 engine: gse
63 policy: highest_severity
64 group_by: member
65 members:
66 - nova_compute_logs
67 - nova_compute_service
68 - nova_compute_instances
69 - nova_compute_libvirt
70 - nova_compute_free_cpu
71 - nova_compute_free_mem
72 hints:
73 - neutron_compute # or contrail_vrouter for contrail nodes
Ales Komarekc9a3eb12016-10-12 11:17:55 +020074
Ales Komareke2b62602016-10-21 13:24:10 +020075Default CPU usage alarms, excerpt from `linux/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +020076
77.. code-block:: yaml
78
79 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +020080 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +020081 linux_system_cpu_critical:
Ales Komareke2b62602016-10-21 13:24:10 +020082 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +020083 enabled: True # implicit
84 description: 'The CPU usage is too high.'
85 severity: critical
86 label:
87 hostname: '$match_by.hostname'
88 node_role: controller
89 match_by: ['hostname']
90 rules:
91 - metric: cpu_wait
92 relational_operator: '>='
93 threshold: 35
94 window: 120
95 periods: 0
96 function: avg
97 - metric: cpu_idle
98 relational_operator: '<='
99 threshold: 5
100 window: 120
101 function: avg
102 linux_system_cpu_warning:
Ales Komareke2b62602016-10-21 13:24:10 +0200103 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +0200104 enabled: True # implicit
105 description: 'The CPU wait times are high.'
106 severity: warning
107 label:
108 hostname: '$match_by.hostname'
109 node_role: controller
110 match_by: ['hostname']
111 rules:
112 - metric: cpu_wait
113 relational_operator: '>='
114 threshold: 15
115 window: 120
116 periods: 0
117 function: avg
Ales Komareke2b62602016-10-21 13:24:10 +0200118 filter:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200119 linux_system_cpu:
Ales Komareke2b62602016-10-21 13:24:10 +0200120 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +0200121 notifications: False
122 alerting: True
123 trigger:
124 vip:
125 - linux_system_cpu_warning # will not render if referenced trigger is disabled
126 - linux_system_cpu_critical
127
Ales Komareke2b62602016-10-21 13:24:10 +0200128CPU usage override for compute node, excerpt from `nova/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +0200129
130.. code-block:: yaml
131
132 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200133 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200134 nova_compute_cpu_critical:
Ales Komareke2b62602016-10-21 13:24:10 +0200135 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +0200136 enabled: True # implicit
137 description: 'The CPU wait times are too high.'
138 severity: critical
139 label:
140 hostname: '$match_by.hostname'
141 node_role: controller
142 match_by: ['hostname']
143 rules:
144 - metric: cpu_wait
145 relational_operator: '>='
146 threshold: 35
147 window: 120
148 periods: 0
149 function: avg
150
151
152
153Alarm override option 1 - override:
154
155.. code-block:: yaml
156
157 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200158 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200159 # Trigger can be disabled
Ales Komareke2b62602016-10-21 13:24:10 +0200160 linux_system_cpu_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200161 enabled: False
Ales Komareke2b62602016-10-21 13:24:10 +0200162 filter:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200163 # Alarm can be overridden
164 linux_system_cpu:
165 trigger:
166 vip:
167 - nova_compute_cpu_critical
168
169Alarm override option 2 - reinitialize:
170
171.. code-block:: yaml
172
173 metric_collector:
174 filter:
175 ...
176 # Alarm is disabled
177 linux_system_cpu:
178 enabled: False
179 # new alarm is created
180 nova_compute_cpu:
181 engine: afd_alarm
182 notifications: False
183 alerting: True
184 trigger:
185 vip:
186 - linux_system_cpu_warning # will not render if referenced trigger is disabled
187 - nova_compute_cpu_critical
188
189
190Remote collector service
191------------------------
192
Ales Komareke2b62602016-10-21 13:24:10 +0200193Remote API check example, excerpt from `nova/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +0200194
195.. code-block:: yaml
196
197 heka:
198 remote_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200199 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200200 nova_control_api_fail:
Ales Komareke2b62602016-10-21 13:24:10 +0200201 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +0200202 description: 'Endpoint check for nova-api failed.'
203 severity: critical
204 alerting: True
205 label:
206 hostname: '$match_by.hostname'
207 node_role: controller
208 match_by: ['hostname']
209 rules:
210 - metric: openstack_check_api
211 relational_operator: '=='
212 threshold: 0
213 window: 60
214 periods: 0
215 function: last
216 service: 'nova-api'
Ales Komareke2b62602016-10-21 13:24:10 +0200217 filter:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200218 nova_control_api:
Ales Komareke2b62602016-10-21 13:24:10 +0200219 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +0200220 notifications: False
221 alerting: True
222 trigger:
223 vip:
224 - nova_control_api_fail
225
Ales Komareke2b62602016-10-21 13:24:10 +0200226Corresponding clusters and alarms, excerpt from `nova/meta/heka.yml`.
Ales Komarekc9a3eb12016-10-12 11:17:55 +0200227
228.. code-block:: yaml
229
230 heka:
231 aggregator:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200232 filter:
Ales Komareke2b62602016-10-21 13:24:10 +0200233 nova_control: # the service_role format
234 engine: gse
Ales Komarekf8d248e2016-10-21 10:27:28 +0200235 policy: highest_severity
236 group_by: member
237 members:
Ales Komareke2b62602016-10-21 13:24:10 +0200238 - nova_control_api
239 - nova_control_endpoint
240 hints:
241 - neutron_control # or contrail_vrouter for contrail nodes
242 - keystone_control
Ales Komarekc9a3eb12016-10-12 11:17:55 +0200243
Jakub Pavlike7d12cd2015-09-03 19:02:45 +0200244
245Read more
246=========
247
jan kaufman1002cd92015-09-16 16:30:48 +0200248* https://hekad.readthedocs.org/en/latest/index.html