============
Heka Formula
============

Heka is an open source stream processing software system developed by Mozilla. It is a "Swiss Army knife" type of tool for data processing.

Sample pillars
==============

Log collector service
---------------------

.. code-block:: yaml

    heka:
      log_collector:
        enabled: true
        output:
          elasticsearch01:
            engine: elasticsearch
            host: localhost
            port: 9200
            encoder: es_json
            message_matcher: 'TRUE'


Metric collector service
------------------------

Local alarm definition for nova compute role

.. code-block:: yaml

    heka:
      metric_collector:
        filter:
          nova_compute_filesystem_warning:
            engine: afd_trigger
            enabled: True  # implicit
            description: "The nova instance filesystem's root free space is low."
            severity: warning
            logical_operator: or  # implicit
            rules:
              - metric: fs_space_percent_free
                relational_operator: '<'
                threshold: 10
                window: 60
                periods: 0
                function: min
                fs: '/var/lib/nova'
          nova_compute_filesystem_critical:
            engine: afd_trigger
            enabled: True  # implicit
            description: "The nova instance filesystem's root free space is too low."
            severity: critical
            logical_operator: or  # implicit
            rules:
              - metric: fs_space_percent_free
                relational_operator: '<'
                threshold: 5
                window: 60
                periods: 0
                function: min
                fs: '/var/lib/nova'
          nova_compute_service:
            engine: afd_alarm
            notifications: False
            alerting: True
            trigger:
              vip:
                - nova_compute_filesystem_warning
                - nova_compute_filesystem_critical

Corresponding cluster and alarm definition for the aggregator

.. code-block:: yaml

    heka:
      aggregator:
        filter:
          nova_compute_service:
            engine: gse_cluster
            policy: highest_severity
            group_by: member
            members:
              - vip
          nova_compute:  # the service_role format
            engine: gse_alarm
            policy: highest_severity
            group_by: member
            members:
              - nova_compute_logs
              - nova_compute_service
              - nova_compute_instances
              - nova_compute_libvirt
              - nova_compute_free_cpu
              - nova_compute_free_mem
            hints:
              - neutron_compute  # or contrail_vrouter for contrail nodes

Default CPU usage alarms

.. code-block:: yaml

    metric_collector:
      filter:
        linux_system_cpu_critical:
          engine: afd_trigger
          enabled: True  # implicit
          description: 'The CPU usage is too high.'
          severity: critical
          label:
            hostname: '$match_by.hostname'
            node_role: controller
          match_by: ['hostname']
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 35
              window: 120
              periods: 0
              function: avg
            - metric: cpu_idle
              relational_operator: '<='
              threshold: 5
              window: 120
              function: avg
        linux_system_cpu_warning:
          engine: afd_trigger
          enabled: True  # implicit
          description: 'The CPU wait times are high.'
          severity: warning
          label:
            hostname: '$match_by.hostname'
            node_role: controller
          match_by: ['hostname']
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 15
              window: 120
              periods: 0
              function: avg
        linux_system_cpu:
          engine: afd_alarm
          notifications: False
          alerting: True
          trigger:
            vip:
              - linux_system_cpu_warning  # will not render if referenced trigger is disabled
              - linux_system_cpu_critical

CPU usage override for compute node

.. code-block:: yaml

    metric_collector:
      filter:
        nova_compute_cpu_critical:
          engine: afd_trigger
          enabled: True  # implicit
          description: 'The CPU wait times are too high.'
          severity: critical
          label:
            hostname: '$match_by.hostname'
            node_role: controller
          match_by: ['hostname']
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 35
              window: 120
              periods: 0
              function: avg

Alarm override option 1 - override:

.. code-block:: yaml

    metric_collector:
      filter:
        # ...
        # Trigger can be disabled
        linux_system_cpu_critical:
          enabled: False
        # Alarm can be overridden
        linux_system_cpu:
          trigger:
            vip:
              - nova_compute_cpu_critical

Alarm override option 2 - reinitialize:

.. code-block:: yaml

    metric_collector:
      filter:
        # ...
        # Alarm is disabled
        linux_system_cpu:
          enabled: False
        # New alarm is created
        nova_compute_cpu:
          engine: afd_alarm
          notifications: False
          alerting: True
          trigger:
            vip:
              - linux_system_cpu_warning  # will not render if referenced trigger is disabled
              - nova_compute_cpu_critical


Remote collector service
------------------------

Remote API check example

.. code-block:: yaml

    heka:
      remote_collector:
        filter:
          nova_control_api_fail:
            engine: afd_trigger
            description: 'Endpoint check for nova-api failed.'
            severity: critical
            alerting: True
            label:
              hostname: '$match_by.hostname'
              node_role: controller
            match_by: ['hostname']
            rules:
              - metric: openstack_check_api
                relational_operator: '=='
                threshold: 0
                window: 60
                periods: 0
                function: last
                service: 'nova-api'
          nova_control_api:
            engine: afd_alarm
            notifications: False
            alerting: True
            trigger:
              vip:
                - nova_control_api_fail

Corresponding clusters and alarms

.. code-block:: yaml

    heka:
      aggregator:
        filter:
          nova_control_service:
            engine: gse_cluster
            policy: highest_severity
            group_by: member
            members:
              - backends
              - http_errors
          nova_control_api:
            policy: highest_severity
            group_by: member
            members:
              - vip
          nova_control_endpoint:
            policy: majority_of_members
            group_by: hostname
            members:
              - endpoint


Read more
=========

* https://hekad.readthedocs.org/en/latest/index.html
jan kaufman1002cd92015-09-16 16:30:48 +0200277* https://hekad.readthedocs.org/en/latest/index.html