Jakub Pavlik | e7d12cd | 2015-09-03 19:02:45 +0200 | [diff] [blame] | 1 | |
| 2 | ============ |
| 3 | Heka Formula |
| 4 | ============ |
| 5 | |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 6 | Heka is an open source stream processing software system developed by Mozilla. Heka is a Swiss Army Knife type tool for data processing. |
Jakub Pavlik | e7d12cd | 2015-09-03 19:02:45 +0200 | [diff] [blame] | 7 | |
| 8 | Sample pillars |
| 9 | ============== |
| 10 | |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 11 | Log collector service |
jan kaufman | 1002cd9 | 2015-09-16 16:30:48 +0200 | [diff] [blame] | 12 | |
Jakub Pavlik | e7d12cd | 2015-09-03 19:02:45 +0200 | [diff] [blame] | 13 | .. code-block:: yaml |
| 14 | |
| 15 | heka: |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 16 | log_collector: |
Jakub Pavlik | e7d12cd | 2015-09-03 19:02:45 +0200 | [diff] [blame] | 17 | enabled: true |
Jakub Pavlik | 6014f91 | 2015-09-03 19:05:05 +0200 | [diff] [blame] | 18 | output: |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 19 | elasticsearch01: |
| 20 | engine: elasticsearch |
Jakub Pavlik | 6014f91 | 2015-09-03 19:05:05 +0200 | [diff] [blame] | 21 | host: localhost |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 22 | port: 9200 |
| 23 | encoder: es_json |
jan kaufman | 1002cd9 | 2015-09-16 16:30:48 +0200 | [diff] [blame] | 24 | message_matcher: TRUE |
jan kaufman | 1002cd9 | 2015-09-16 16:30:48 +0200 | [diff] [blame] | 25 | |
Ales Komarek | f8d248e | 2016-10-21 10:27:28 +0200 | [diff] [blame] | 26 | |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 27 | Metric collector service |
Ales Komarek | f8d248e | 2016-10-21 10:27:28 +0200 | [diff] [blame] | 28 | ------------------------ |
| 29 | |
| 30 | Local alarm definition for nova compute role |
jan kaufman | 1002cd9 | 2015-09-16 16:30:48 +0200 | [diff] [blame] | 31 | |
| 32 | .. code-block:: yaml |
| 33 | |
jan kaufman | 1002cd9 | 2015-09-16 16:30:48 +0200 | [diff] [blame] | 34 | heka: |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 35 | metric_collector: |
Ales Komarek | f8d248e | 2016-10-21 10:27:28 +0200 | [diff] [blame] | 36 | filter: |
| 37 | nova_compute_filesystem_warning: |
| 38 | engine: afd_trigger |
| 39 | enabled: True # implicit |
| 40 | description: "The nova instance filesystem's root free space is low." |
| 41 | severity: warning |
| 42 | logical_operator: or # implicit |
| 43 | rules: |
| 44 | - metric: fs_space_percent_free |
| 45 | relational_operator: '<' |
| 46 | threshold: 10 |
| 47 | window: 60 |
| 48 | periods: 0 |
| 49 | function: min |
| 50 | fs: '/var/lib/nova' |
| 51 | nova_compute_filesystem_critical: |
| 52 | engine: afd_trigger |
| 53 | enabled: True # implicit |
| 54 | description: "The nova instance filesystem's root free space is low." |
| 55 | severity: critical |
| 56 | logical_operator: or # implicit |
| 57 | rules: |
| 58 | - metric: fs_space_percent_free |
| 59 | relational_operator: '<' |
| 60 | threshold: 5 |
| 61 | window: 60 |
| 62 | periods: 0 |
| 63 | function: min |
| 64 | fs: '/var/lib/nova' |
| 65 | nova_compute_service: |
| 66 | engine: afd_alarm |
| 67 | notifications: False |
| 68 | alerting: True |
| 69 | trigger: |
| 70 | vip: |
| 71 | - nova_compute_filesystem_warning |
| 72 | - nova_compute_filesystem_critical |
| 73 | - nova_compute_filesystem_critical |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 74 | |
Ales Komarek | f8d248e | 2016-10-21 10:27:28 +0200 | [diff] [blame] | 75 | heka: |
| 76 | aggregator: |
| 77 | filter: |
| 78 | nova_compute_service: |
| 79 | engine: gse_cluster |
| 80 | policy: highest_severity |
| 81 | group_by: member |
| 82 | members: |
| 83 | - vip |
| 84 | nova_compute: # the service_role format |
| 85 | engine: gse_alarm |
| 86 | policy: highest_severity |
| 87 | group_by: member |
| 88 | members: |
| 89 | - nova_compute_logs |
| 90 | - nova_compute_service |
| 91 | - nova_compute_instances |
| 92 | - nova_compute_libvirt |
| 93 | - nova_compute_free_cpu |
| 94 | - nova_compute_free_mem |
| 95 | hints: |
| 96 | - neutron_compute # or contrail_vrouter for contrail nodes |
| 97 | |
| 98 | Default CPU usage alarms |
| 99 | |
| 100 | .. code-block:: yaml |
| 101 | |
| 102 | metric_collector: |
| 103 | filter: |
| 104 | linux_system_cpu_critical: |
| 105 | engine: afd_trigger |
| 106 | enabled: True # implicit |
| 107 | description: 'The CPU usage is too high.' |
| 108 | severity: critical |
| 109 | label: |
| 110 | hostname: '$match_by.hostname' |
| 111 | node_role: controller |
| 112 | match_by: ['hostname'] |
| 113 | rules: |
| 114 | - metric: cpu_wait |
| 115 | relational_operator: '>=' |
| 116 | threshold: 35 |
| 117 | window: 120 |
| 118 | periods: 0 |
| 119 | function: avg |
| 120 | - metric: cpu_idle |
| 121 | relational_operator: '<=' |
| 122 | threshold: 5 |
| 123 | window: 120 |
| 124 | function: avg |
| 125 | linux_system_cpu_warning: |
| 126 | engine: afd_trigger |
| 127 | enabled: True # implicit |
| 128 | description: 'The CPU wait times are high.' |
| 129 | severity: warning |
| 130 | label: |
| 131 | hostname: '$match_by.hostname' |
| 132 | node_role: controller |
| 133 | match_by: ['hostname'] |
| 134 | rules: |
| 135 | - metric: cpu_wait |
| 136 | relational_operator: '>=' |
| 137 | threshold: 15 |
| 138 | window: 120 |
| 139 | periods: 0 |
| 140 | function: avg |
| 141 | linux_system_cpu: |
| 142 | engine: afd_alarm |
| 143 | notifications: False |
| 144 | alerting: True |
| 145 | trigger: |
| 146 | vip: |
| 147 | - linux_system_cpu_warning # will not render if referenced trigger is disabled |
| 148 | - linux_system_cpu_critical |
| 149 | |
| 150 | CPU usage override for compute node |
| 151 | |
| 152 | .. code-block:: yaml |
| 153 | |
| 154 | metric_collector: |
| 155 | filter: |
| 156 | nova_compute_cpu_critical: |
| 157 | engine: afd_trigger |
| 158 | enabled: True # implicit |
| 159 | description: 'The CPU wait times are too high.' |
| 160 | severity: critical |
| 161 | label: |
| 162 | hostname: '$match_by.hostname' |
| 163 | node_role: controller |
| 164 | match_by: ['hostname'] |
| 165 | rules: |
| 166 | - metric: cpu_wait |
| 167 | relational_operator: '>=' |
| 168 | threshold: 35 |
| 169 | window: 120 |
| 170 | periods: 0 |
| 171 | function: avg |
| 172 | |
| 175 | Alarm override option 1 - override: |
| 176 | |
| 177 | .. code-block:: yaml |
| 178 | |
| 179 | metric_collector: |
| 180 | filter: |
| 181 | ... |
| 182 | # Trigger can be disabled |
| 183 | linux_system_cpu_critical: |
| 184 | enabled: False |
| 185 | # Alarm can be overridden |
| 186 | linux_system_cpu: |
| 187 | trigger: |
| 188 | vip: |
| 189 | - nova_compute_cpu_critical |
| 190 | |
| 191 | Alarm override option 2 - reinitialize: |
| 192 | |
| 193 | .. code-block:: yaml |
| 194 | |
| 195 | metric_collector: |
| 196 | filter: |
| 197 | ... |
| 198 | # Alarm is disabled |
| 199 | linux_system_cpu: |
| 200 | enabled: False |
| 201 | # new alarm is created |
| 202 | nova_compute_cpu: |
| 203 | engine: afd_alarm |
| 204 | notifications: False |
| 205 | alerting: True |
| 206 | trigger: |
| 207 | vip: |
| 208 | - linux_system_cpu_warning # will not render if referenced trigger is disabled |
| 209 | - nova_compute_cpu_critical |
| 210 | |
| 211 | |
| 212 | Remote collector service |
| 213 | ------------------------ |
| 214 | |
| 215 | Remote api check example |
| 216 | |
| 217 | .. code-block:: yaml |
| 218 | |
| 219 | heka: |
| 220 | remote_collector: |
| 221 | filter: |
| 222 | nova_control_api_fail: |
| 223 | engine: afd_trigger |
| 224 | description: 'Endpoint check for nova-api failed.' |
| 225 | severity: critical |
| 226 | alerting: True |
| 227 | label: |
| 228 | hostname: '$match_by.hostname' |
| 229 | node_role: controller |
| 230 | match_by: ['hostname'] |
| 231 | rules: |
| 232 | - metric: openstack_check_api |
| 233 | relational_operator: '==' |
| 234 | threshold: 0 |
| 235 | window: 60 |
| 236 | periods: 0 |
| 237 | function: last |
| 238 | service: 'nova-api' |
| 239 | nova_control_api: |
| 240 | engine: afd_alarm |
| 241 | notifications: False |
| 242 | alerting: True |
| 243 | trigger: |
| 244 | vip: |
| 245 | - nova_control_api_fail |
| 246 | |
| 247 | Corresponding clusters and alarms |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 248 | |
| 249 | .. code-block:: yaml |
| 250 | |
| 251 | heka: |
| 252 | aggregator: |
Ales Komarek | f8d248e | 2016-10-21 10:27:28 +0200 | [diff] [blame] | 253 | filter: |
| 254 | nova_control_service: |
| 255 | engine: gse_cluster |
| 256 | policy: highest_severity |
| 257 | group_by: member |
| 258 | members: |
| 259 | - backends |
| 260 | - http_errors |
| 261 | nova_control_api: |
| 262 | policy: highest_severity |
| 263 | group_by: member |
| 264 | members: |
| 265 | - vip |
| 266 | nova_control_endpoint: |
| 267 | policy: majority_of_members |
| 268 | group_by: hostname |
| 269 | members: |
| 270 | - endpoint |
| 271 | |
Ales Komarek | c9a3eb1 | 2016-10-12 11:17:55 +0200 | [diff] [blame] | 272 | |
Jakub Pavlik | e7d12cd | 2015-09-03 19:02:45 +0200 | [diff] [blame] | 273 | |
| 274 | Read more |
| 275 | ========= |
| 276 | |
jan kaufman | 1002cd9 | 2015-09-16 16:30:48 +0200 | [diff] [blame] | 277 | * https://hekad.readthedocs.org/en/latest/index.html |