blob: b3ad496036853513ccc6d81d657357c0e787564d [file] [log] [blame]
Jakub Pavlike7d12cd2015-09-03 19:02:45 +02001
2============
3Heka Formula
4============
5
Ales Komarekc9a3eb12016-10-12 11:17:55 +02006Heka is an open source stream processing software system developed by Mozilla. Heka is a Swiss Army Knife type tool for data processing.
Jakub Pavlike7d12cd2015-09-03 19:02:45 +02007
8Sample pillars
9==============
10
Ales Komarekc9a3eb12016-10-12 11:17:55 +020011Metric collector service
Ales Komarekf8d248e2016-10-21 10:27:28 +020012------------------------
13
Ales Komareke2b62602016-10-21 13:24:10 +020014Local alarm definition for nova compute role, excerpt from `nova/meta/heka.yml`.
jan kaufman1002cd92015-09-16 16:30:48 +020015
16.. code-block:: yaml
17
jan kaufman1002cd92015-09-16 16:30:48 +020018 heka:
Ales Komarekc9a3eb12016-10-12 11:17:55 +020019 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +020020 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +020021 nova_compute_filesystem_warning:
Ales Komarekf8d248e2016-10-21 10:27:28 +020022 enabled: True # implicit
23 description: "The nova instance filesystem's root free space is low."
24 severity: warning
25 logical_operator: or # implicit
26 rules:
27 - metric: fs_space_percent_free
28 relational_operator: '<'
29 threshold: 10
30 window: 60
31 periods: 0
32 function: min
Ales Komarek04a52952016-10-21 16:26:49 +020033 dimension:
34 fs: '/var/lib/nova'
Ales Komarekf8d248e2016-10-21 10:27:28 +020035 nova_compute_filesystem_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +020036 enabled: True # implicit
37 description: "The nova instance filesystem's root free space is low."
38 severity: critical
39 logical_operator: or # implicit
40 rules:
41 - metric: fs_space_percent_free
42 relational_operator: '<'
43 threshold: 5
44 window: 60
45 periods: 0
46 function: min
Ales Komarek04a52952016-10-21 16:26:49 +020047 dimension:
48 fs: '/var/lib/nova'
Ales Komareke2b62602016-10-21 13:24:10 +020049 filter:
Ales Komarekf8d248e2016-10-21 10:27:28 +020050 nova_compute_service:
Ales Komareke2b62602016-10-21 13:24:10 +020051 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +020052 notifications: False
53 alerting: True
Ales Komarek04a52952016-10-21 16:26:49 +020054 dimension:
55 hostname: '$match_by.hostname'
56 node_role: compute
57 match_by:
58 - hostname
59 triggers:
60 - nova_compute_filesystem_warning
61 - nova_compute_filesystem_critical
Ales Komareke2b62602016-10-21 13:24:10 +020062 aggregator:
63 filter:
Ales Komarek04a52952016-10-21 16:26:49 +020064 nova_compute_service: # the service_role format
Ales Komareke2b62602016-10-21 13:24:10 +020065 engine: gse
66 policy: highest_severity
67 group_by: member
Ales Komarek04a52952016-10-21 16:26:49 +020068 match:
69 node_role: compute
70 dimension:
71 cluster: nova-compute-plane
Ales Komareke2b62602016-10-21 13:24:10 +020072 members:
73 - nova_compute_logs
74 - nova_compute_service
75 - nova_compute_instances
76 - nova_compute_libvirt
77 - nova_compute_free_cpu
78 - nova_compute_free_mem
79 hints:
80 - neutron_compute # or contrail_vrouter for contrail nodes
Ales Komarek04a52952016-10-21 16:26:49 +020081 nova_compute_plane: # the service_role format
82 engine: gse
83 policy: highest_severity
84 group_by: member
85 match:
86 cluster: nova-compute-plane
87
Ales Komarekc9a3eb12016-10-12 11:17:55 +020088
Ales Komareke2b62602016-10-21 13:24:10 +020089Default CPU usage alarms, excerpt from `linux/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +020090
91.. code-block:: yaml
92
93 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +020094 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +020095 linux_system_cpu_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +020096 enabled: True # implicit
97 description: 'The CPU usage is too high.'
98 severity: critical
Ales Komarekf8d248e2016-10-21 10:27:28 +020099 rules:
100 - metric: cpu_wait
101 relational_operator: '>='
102 threshold: 35
103 window: 120
104 periods: 0
105 function: avg
106 - metric: cpu_idle
107 relational_operator: '<='
108 threshold: 5
109 window: 120
110 function: avg
111 linux_system_cpu_warning:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200112 enabled: True # implicit
113 description: 'The CPU wait times are high.'
114 severity: warning
Ales Komarekf8d248e2016-10-21 10:27:28 +0200115 rules:
116 - metric: cpu_wait
117 relational_operator: '>='
118 threshold: 15
119 window: 120
120 periods: 0
121 function: avg
Ales Komareke2b62602016-10-21 13:24:10 +0200122 filter:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200123 linux_system_cpu:
Ales Komareke2b62602016-10-21 13:24:10 +0200124 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +0200125 notifications: False
126 alerting: True
Ales Komarek04a52952016-10-21 16:26:49 +0200127 triggers:
128 - linux_system_cpu_warning # will not render if referenced trigger is disabled
129 - linux_system_cpu_critical
130 dimension:
131 hostname: '$match_by.hostname'
132 node_role: controller
133 match_by: ['hostname']
134
Ales Komarekf8d248e2016-10-21 10:27:28 +0200135
Ales Komareke2b62602016-10-21 13:24:10 +0200136CPU usage override for compute node, excerpt from `nova/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +0200137
138.. code-block:: yaml
139
140 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200141 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200142 nova_compute_cpu_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200143 enabled: True # implicit
144 description: 'The CPU wait times are too high.'
145 severity: critical
Ales Komarekf8d248e2016-10-21 10:27:28 +0200146 rules:
147 - metric: cpu_wait
148 relational_operator: '>='
149 threshold: 35
150 window: 120
151 periods: 0
152 function: avg
153
154
155
156Alarm override option 1 - override:
157
158.. code-block:: yaml
159
160 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200161 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200162 # Trigger can be disabled
Ales Komareke2b62602016-10-21 13:24:10 +0200163 linux_system_cpu_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200164 enabled: False
Ales Komareke2b62602016-10-21 13:24:10 +0200165 filter:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200166 # Alarm can be overridden
167 linux_system_cpu:
168 trigger:
169 vip:
170 - nova_compute_cpu_critical
171
172Alarm override option 2 - reinitialize:
173
174.. code-block:: yaml
175
176 metric_collector:
177 filter:
178 ...
179 # Alarm is disabled
180 linux_system_cpu:
181 enabled: False
182 # new alarm is created
183 nova_compute_cpu:
184 engine: afd_alarm
185 notifications: False
186 alerting: True
Ales Komarek04a52952016-10-21 16:26:49 +0200187 triggers:
188 - linux_system_cpu_warning # will not render if referenced trigger is disabled
189 - nova_compute_cpu_critical
190 dimension:
191 hostname: '$match_by.hostname'
192 node_role: controller
193 match_by: ['hostname']
Ales Komarekf8d248e2016-10-21 10:27:28 +0200194
195
196Remote collector service
197------------------------
198
Ales Komareke2b62602016-10-21 13:24:10 +0200199Remote API check example, excerpt from `nova/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +0200200
201.. code-block:: yaml
202
203 heka:
204 remote_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200205 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200206 nova_control_api_fail:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200207 description: 'Endpoint check for nova-api failed.'
208 severity: critical
Ales Komarekf8d248e2016-10-21 10:27:28 +0200209 rules:
210 - metric: openstack_check_api
211 relational_operator: '=='
212 threshold: 0
213 window: 60
214 periods: 0
215 function: last
216 service: 'nova-api'
Ales Komareke2b62602016-10-21 13:24:10 +0200217 filter:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200218 nova_control_api:
Ales Komareke2b62602016-10-21 13:24:10 +0200219 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +0200220 notifications: False
221 alerting: True
Ales Komarek04a52952016-10-21 16:26:49 +0200222 dimension:
223 hostname: '$match_by.hostname'
224 node_role: controller
225 match_by: ['hostname']
226 triggers:
227 - nova_control_api_fail
Ales Komarekf8d248e2016-10-21 10:27:28 +0200228
Ales Komareke2b62602016-10-21 13:24:10 +0200229Corresponding clusters and alarms, excerpt from `nova/meta/heka.yml`.
Ales Komarekc9a3eb12016-10-12 11:17:55 +0200230
231.. code-block:: yaml
232
233 heka:
234 aggregator:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200235 filter:
Ales Komarek04a52952016-10-21 16:26:49 +0200236 nova_control_service: # the service_role format
Ales Komareke2b62602016-10-21 13:24:10 +0200237 engine: gse
Ales Komarekf8d248e2016-10-21 10:27:28 +0200238 policy: highest_severity
239 group_by: member
Ales Komarek04a52952016-10-21 16:26:49 +0200240 match:
241 node_role: control
242 dimension:
243 cluster: nova-control-plane
Ales Komarekf8d248e2016-10-21 10:27:28 +0200244 members:
Ales Komareke2b62602016-10-21 13:24:10 +0200245 - nova_control_api
246 - nova_control_endpoint
247 hints:
248 - neutron_control # or contrail_vrouter for contrail nodes
249 - keystone_control
Ales Komarek04a52952016-10-21 16:26:49 +0200250 nova_control_plane: # the service_role format
251 engine: gse
252 policy: highest_severity
253 group_by: member
254 match:
255 cluster: nova-control-plane
256
Ales Komarekc9a3eb12016-10-12 11:17:55 +0200257
Jakub Pavlike7d12cd2015-09-03 19:02:45 +0200258
259Read more
260=========
261
jan kaufman1002cd92015-09-16 16:30:48 +0200262* https://hekad.readthedocs.org/en/latest/index.html