blob: fcd2666a4d2b4848e8590adcc481028669efea7c [file] [log] [blame]
Jakub Pavlike7d12cd2015-09-03 19:02:45 +02001
2============
3Heka Formula
4============
5
Ales Komarekc9a3eb12016-10-12 11:17:55 +02006Heka is an open-source stream processing system developed by Mozilla. It is a Swiss Army knife-style tool for data processing.
Jakub Pavlike7d12cd2015-09-03 19:02:45 +02007
8Sample pillars
9==============
10
Ales Komarekc9a3eb12016-10-12 11:17:55 +020011Metric collector service
Ales Komarekf8d248e2016-10-21 10:27:28 +020012------------------------
13
Ales Komareke2b62602016-10-21 13:24:10 +020014Local alarm definition for nova compute role, excerpt from `nova/meta/heka.yml`.
jan kaufman1002cd92015-09-16 16:30:48 +020015
16.. code-block:: yaml
17
jan kaufman1002cd92015-09-16 16:30:48 +020018 heka:
Ales Komarekc9a3eb12016-10-12 11:17:55 +020019 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +020020 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +020021 nova_compute_filesystem_warning:
Ales Komarekf8d248e2016-10-21 10:27:28 +020022 enabled: True # implicit
23 description: "The nova instance filesystem's root free space is low."
24 severity: warning
25 logical_operator: or # implicit
26 rules:
27 - metric: fs_space_percent_free
28 relational_operator: '<'
29 threshold: 10
30 window: 60
31 periods: 0
32 function: min
Ales Komarek04a52952016-10-21 16:26:49 +020033 dimension:
34 fs: '/var/lib/nova'
Ales Komarekf8d248e2016-10-21 10:27:28 +020035 nova_compute_filesystem_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +020036 description: "The nova instance filesystem's root free space is low."
37 severity: critical
Ales Komarekf8d248e2016-10-21 10:27:28 +020038 rules:
39 - metric: fs_space_percent_free
40 relational_operator: '<'
41 threshold: 5
42 window: 60
43 periods: 0
44 function: min
Ales Komarek04a52952016-10-21 16:26:49 +020045 dimension:
46 fs: '/var/lib/nova'
Ales Komarek9a8bd082016-10-25 01:25:09 +020047 alarm:
48 nova_compute_filesystem:
Ales Komarekf8d248e2016-10-21 10:27:28 +020049 notifications: False
50 alerting: True
Ales Komarek04a52952016-10-21 16:26:49 +020051 dimension:
Ales Komarek04a52952016-10-21 16:26:49 +020052 node_role: controller
Ales Komarek04a52952016-10-21 16:26:49 +020053 triggers:
54 - nova_compute_filesystem_warning
55 - nova_compute_filesystem_critical
Ales Komareke2b62602016-10-21 13:24:10 +020056 aggregator:
Ales Komarek9a8bd082016-10-25 01:25:09 +020057 alarm_cluster:
Ales Komarek04a52952016-10-21 16:26:49 +020058 nova_compute_service: # the service_role format
Ales Komareke2b62602016-10-21 13:24:10 +020059 policy: highest_severity
60 group_by: member
Ales Komarek04a52952016-10-21 16:26:49 +020061 match:
62 node_role: compute
63 dimension:
64 cluster: nova-compute-plane
Ales Komareke2b62602016-10-21 13:24:10 +020065 members:
66 - nova_compute_logs
Ales Komarek9a8bd082016-10-25 01:25:09 +020067 - nova_compute_filesystem
Ales Komareke2b62602016-10-21 13:24:10 +020068 - nova_compute_instances
69 - nova_compute_libvirt
70 - nova_compute_free_cpu
71 - nova_compute_free_mem
72 hints:
73 - neutron_compute # or contrail_vrouter for contrail nodes
Ales Komarek04a52952016-10-21 16:26:49 +020074 nova_compute_plane: # the service_role format
75 engine: gse
76 policy: highest_severity
77 group_by: member
78 match:
79 cluster: nova-compute-plane
80
Ales Komareke2b62602016-10-21 13:24:10 +020081Default CPU usage alarms, excerpt from `linux/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +020082
83.. code-block:: yaml
84
85 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +020086 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +020087 linux_system_cpu_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +020088 description: 'The CPU usage is too high.'
89 severity: critical
Ales Komarekf8d248e2016-10-21 10:27:28 +020090 rules:
91 - metric: cpu_wait
92 relational_operator: '>='
93 threshold: 35
94 window: 120
95 periods: 0
96 function: avg
97 - metric: cpu_idle
98 relational_operator: '<='
99 threshold: 5
100 window: 120
101 function: avg
102 linux_system_cpu_warning:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200103 description: 'The CPU wait times are high.'
104 severity: warning
Ales Komarekf8d248e2016-10-21 10:27:28 +0200105 rules:
106 - metric: cpu_wait
107 relational_operator: '>='
108 threshold: 15
109 window: 120
110 periods: 0
111 function: avg
Ales Komarek9a8bd082016-10-25 01:25:09 +0200112 alarm:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200113 linux_system_cpu:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200114 notifications: False
115 alerting: True
Ales Komarek04a52952016-10-21 16:26:49 +0200116 triggers:
117 - linux_system_cpu_warning # will not render if referenced trigger is disabled
118 - linux_system_cpu_critical
119 dimension:
Ales Komarek04a52952016-10-21 16:26:49 +0200120 node_role: controller
Ales Komarek04a52952016-10-21 16:26:49 +0200121
Ales Komarekf8d248e2016-10-21 10:27:28 +0200122
Ales Komareke2b62602016-10-21 13:24:10 +0200123CPU usage override for compute node, excerpt from `nova/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +0200124
125.. code-block:: yaml
126
127 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200128 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200129 nova_compute_cpu_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200130 description: 'The CPU wait times are too high.'
131 severity: critical
Ales Komarekf8d248e2016-10-21 10:27:28 +0200132 rules:
133 - metric: cpu_wait
134 relational_operator: '>='
135 threshold: 35
136 window: 120
137 periods: 0
138 function: avg
139
142Alarm override option 1 - override:
143
144.. code-block:: yaml
145
146 metric_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200147 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200148 # Trigger can be disabled
Ales Komareke2b62602016-10-21 13:24:10 +0200149 linux_system_cpu_critical:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200150 enabled: False
Ales Komarek9a8bd082016-10-25 01:25:09 +0200151 alarm:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200152 # Alarm can be overridden
153 linux_system_cpu:
Ales Komarek00ef62b2016-10-21 17:18:05 +0200154 triggers:
155 - nova_compute_cpu_critical
Ales Komarekf8d248e2016-10-21 10:27:28 +0200156
157Alarm override option 2 - reinitialize:
158
159.. code-block:: yaml
160
161 metric_collector:
Ales Komarek9a8bd082016-10-25 01:25:09 +0200162 alarm:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200163 ...
164 # Alarm is disabled
165 linux_system_cpu:
166 enabled: False
167 # new alarm is created
168 nova_compute_cpu:
Ales Komarek00ef62b2016-10-21 17:18:05 +0200169 engine: afd
Ales Komarekf8d248e2016-10-21 10:27:28 +0200170 notifications: False
171 alerting: True
Ales Komarek04a52952016-10-21 16:26:49 +0200172 triggers:
173 - linux_system_cpu_warning # will not render if referenced trigger is disabled
174 - nova_compute_cpu_critical
175 dimension:
Ales Komarek04a52952016-10-21 16:26:49 +0200176 node_role: controller
Ales Komarekf8d248e2016-10-21 10:27:28 +0200177
178
179Remote collector service
180------------------------
181
Ales Komareke2b62602016-10-21 13:24:10 +0200182Remote API check example, excerpt from `nova/meta/heka.yml`.
Ales Komarekf8d248e2016-10-21 10:27:28 +0200183
184.. code-block:: yaml
185
186 heka:
187 remote_collector:
Ales Komareke2b62602016-10-21 13:24:10 +0200188 trigger:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200189 nova_control_api_fail:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200190 description: 'Endpoint check for nova-api failed.'
191 severity: critical
Ales Komarekf8d248e2016-10-21 10:27:28 +0200192 rules:
193 - metric: openstack_check_api
194 relational_operator: '=='
195 threshold: 0
196 window: 60
197 periods: 0
198 function: last
Ales Komarek00ef62b2016-10-21 17:18:05 +0200199 dimension:
200 service: 'nova-api'
Ales Komarek9a8bd082016-10-25 01:25:09 +0200201 alarm:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200202 nova_control_api:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200203 notifications: False
204 alerting: True
Ales Komarek04a52952016-10-21 16:26:49 +0200205 dimension:
Ales Komarek04a52952016-10-21 16:26:49 +0200206 node_role: controller
Ales Komarek04a52952016-10-21 16:26:49 +0200207 triggers:
208 - nova_control_api_fail
Ales Komarekf8d248e2016-10-21 10:27:28 +0200209
Ales Komareke2b62602016-10-21 13:24:10 +0200210Corresponding clusters and alarms, excerpt from `nova/meta/heka.yml`.
Ales Komarekc9a3eb12016-10-12 11:17:55 +0200211
212.. code-block:: yaml
213
214 heka:
215 aggregator:
Ales Komarek9a8bd082016-10-25 01:25:09 +0200216 alarm_cluster:
Ales Komarek00ef62b2016-10-21 17:18:05 +0200217 nova_control_service:
Ales Komarekf8d248e2016-10-21 10:27:28 +0200218 policy: highest_severity
219 group_by: member
Ales Komarek04a52952016-10-21 16:26:49 +0200220 match:
221 node_role: control
222 dimension:
Ales Komarek00ef62b2016-10-21 17:18:05 +0200223 cluster: openstack-control-plane
Ales Komarekf8d248e2016-10-21 10:27:28 +0200224 members:
Ales Komareke2b62602016-10-21 13:24:10 +0200225 - nova_control_api
226 - nova_control_endpoint
227 hints:
228 - neutron_control # or contrail_vrouter for contrail nodes
229 - keystone_control
Ales Komarek00ef62b2016-10-21 17:18:05 +0200230 openstack_control_plane:
Ales Komarek04a52952016-10-21 16:26:49 +0200231 engine: gse
232 policy: highest_severity
233 group_by: member
234 match:
Ales Komarek00ef62b2016-10-21 17:18:05 +0200235 cluster: openstack-control-plane
Jakub Pavlike7d12cd2015-09-03 19:02:45 +0200236
237Read more
238=========
239
jan kaufman1002cd92015-09-16 16:30:48 +0200240* https://hekad.readthedocs.org/en/latest/index.html