blob: 51d7efb37eb72871866aab98162d2521020c6ea2 [file] [log] [blame]
============
Heka Formula
============

Heka is an open source stream processing software system developed by Mozilla.
It is a "Swiss Army knife" type tool for data processing.

Sample pillars
==============

Metric collector service
------------------------

Local alarm definition for nova compute role, excerpt from ``nova/meta/heka.yml``.

.. code-block:: yaml

    heka:
      metric_collector:
        trigger:
          nova_compute_filesystem_warning:
            enabled: True  # implicit
            description: "The nova instance filesystem's root free space is low."
            severity: warning
            logical_operator: or  # implicit
            rules:
              - metric: fs_space_percent_free
                relational_operator: '<'
                threshold: 10
                window: 60
                periods: 0
                function: min
                dimension:
                  fs: '/var/lib/nova'
          nova_compute_filesystem_critical:
            description: "The nova instance filesystem's root free space is low."
            severity: critical
            rules:
              - metric: fs_space_percent_free
                relational_operator: '<'
                threshold: 5
                window: 60
                periods: 0
                function: min
                dimension:
                  fs: '/var/lib/nova'
        filter:
          nova_compute_service:
            engine: afd
            notifications: False
            alerting: True
            dimension:
              hostname: '$match_by.hostname'
              node_role: controller
            match_by:
              - hostname
            triggers:
              - nova_compute_filesystem_warning
              - nova_compute_filesystem_critical
      aggregator:
        filter:
          nova_compute_service:  # the service_role format
            engine: gse
            policy: highest_severity
            group_by: member
            match:
              node_role: compute
            dimension:
              cluster: nova-compute-plane
            members:
              - nova_compute_logs
              - nova_compute_service
              - nova_compute_instances
              - nova_compute_libvirt
              - nova_compute_free_cpu
              - nova_compute_free_mem
            hints:
              - neutron_compute  # or contrail_vrouter for contrail nodes
          nova_compute_plane:  # the service_role format
            engine: gse
            policy: highest_severity
            group_by: member
            match:
              cluster: nova-compute-plane

Default CPU usage alarms, excerpt from ``linux/meta/heka.yml``.

.. code-block:: yaml

    metric_collector:
      trigger:
        linux_system_cpu_critical:
          description: 'The CPU usage is too high.'
          severity: critical
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 35
              window: 120
              periods: 0
              function: avg
            - metric: cpu_idle
              relational_operator: '<='
              threshold: 5
              window: 120
              function: avg
        linux_system_cpu_warning:
          description: 'The CPU wait times are high.'
          severity: warning
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 15
              window: 120
              periods: 0
              function: avg
      filter:
        linux_system_cpu:
          engine: afd
          notifications: False
          alerting: True
          triggers:
            - linux_system_cpu_warning  # will not render if referenced trigger is disabled
            - linux_system_cpu_critical
          dimension:
            hostname: '$match_by.hostname'
            node_role: controller
          match_by: ['hostname']


CPU usage override for compute node, excerpt from ``nova/meta/heka.yml``.

.. code-block:: yaml

    metric_collector:
      trigger:
        nova_compute_cpu_critical:
          description: 'The CPU wait times are too high.'
          severity: critical
          rules:
            - metric: cpu_wait
              relational_operator: '>='
              threshold: 35
              window: 120
              periods: 0
              function: avg

Alarm override option 1 - override:

.. code-block:: yaml

    metric_collector:
      trigger:
        # Trigger can be disabled
        linux_system_cpu_critical:
          enabled: False
      filter:
        # Alarm can be overridden
        linux_system_cpu:
          triggers:
            - nova_compute_cpu_critical

Alarm override option 2 - reinitialize:

.. code-block:: yaml

    metric_collector:
      filter:
        # ...
        # Alarm is disabled
        linux_system_cpu:
          enabled: False
        # New alarm is created
        nova_compute_cpu:
          engine: afd
          notifications: False
          alerting: True
          triggers:
            - linux_system_cpu_warning  # will not render if referenced trigger is disabled
            - nova_compute_cpu_critical
          dimension:
            hostname: '$match_by.hostname'
            node_role: controller
          match_by: ['hostname']


Remote collector service
------------------------

Remote API check example, excerpt from ``nova/meta/heka.yml``.

.. code-block:: yaml

    heka:
      remote_collector:
        trigger:
          nova_control_api_fail:
            description: 'Endpoint check for nova-api failed.'
            severity: critical
            rules:
              - metric: openstack_check_api
                relational_operator: '=='
                threshold: 0
                window: 60
                periods: 0
                function: last
                dimension:
                  service: 'nova-api'
        filter:
          nova_control_api:
            engine: afd
            notifications: False
            alerting: True
            dimension:
              hostname: '$match_by.hostname'
              node_role: controller
            match_by: ['hostname']
            triggers:
              - nova_control_api_fail

Corresponding clusters and alarms, excerpt from ``nova/meta/heka.yml``.

.. code-block:: yaml

    heka:
      aggregator:
        filter:
          nova_control_service:
            engine: gse
            policy: highest_severity
            group_by: member
            match:
              node_role: control
            dimension:
              cluster: openstack-control-plane
            members:
              - nova_control_api
              - nova_control_endpoint
            hints:
              - neutron_control  # or contrail_vrouter for contrail nodes
              - keystone_control
          openstack_control_plane:
            engine: gse
            policy: highest_severity
            group_by: member
            match:
              cluster: openstack-control-plane

Read more
=========

* https://hekad.readthedocs.org/en/latest/index.html