blob: 35c877a6b831543ffc40e6a7e5e4fa73b3e914f6 [file] [log] [blame]
Éric Lemoine71272712016-11-08 12:53:51 +00001-- Copyright 2015 Mirantis, Inc.
2--
3-- Licensed under the Apache License, Version 2.0 (the "License");
4-- you may not use this file except in compliance with the License.
5-- You may obtain a copy of the License at
6--
7-- http://www.apache.org/licenses/LICENSE-2.0
8--
9-- Unless required by applicable law or agreed to in writing, software
10-- distributed under the License is distributed on an "AS IS" BASIS,
11-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-- See the License for the specific language governing permissions and
13-- limitations under the License.
14
15EXPORT_ASSERT_TO_GLOBALS=true
16require('luaunit')
17package.path = package.path .. ";../heka/files/lua/common/?.lua;lua/mocks/?.lua"
18local lma_alarm = require('afd_alarms')
19local consts = require('gse_constants')
20
-- Alarm definitions shared by the tests below. Several tests load a single
-- entry by position (alarms[3], alarms[5], ...), so the order of the entries
-- must be preserved.
local alarms = {
    { -- 1
        name = 'FS_all_no_field',
        description = 'FS all no field',
        severity = 'warning',
        enabled = true,
        trigger = {
            logical_operator = 'and',
            rules = {
                { metric = 'fs_space_percent_free', window = 120,
                  ['function'] = 'avg', relational_operator = '<=', threshold = 11 },
            },
        },
    },
    { -- 2 (numeric settings are deliberately given as strings here)
        name = 'RabbitMQ_Critical',
        description = 'Number of messages in queue is critical',
        severity = 'critical',
        enabled = true,
        trigger = {
            logical_operator = 'or',
            rules = {
                { metric = 'rabbitmq_messages', fields = {},
                  window = "300", periods = "0",
                  ['function'] = 'min', relational_operator = '>=', threshold = "50" },
            },
        },
    },
    { -- 3
        name = 'CPU_Critical_Controller',
        description = 'CPU is critical for the controller',
        severity = 'critical',
        enabled = true,
        trigger = {
            logical_operator = 'or',
            rules = {
                { metric = 'cpu_idle', window = 120, periods = 2,
                  ['function'] = 'avg', relational_operator = '<=', threshold = 5 },
                { metric = 'cpu_wait', window = 120, periods = 1,
                  ['function'] = 'avg', relational_operator = '>=', threshold = 20 },
            },
        },
    },
    { -- 4
        name = 'CPU_Warning_Controller',
        description = 'CPU is warning for controller',
        severity = 'warning',
        enabled = true,
        trigger = {
            logical_operator = 'or',
            rules = {
                { metric = 'cpu_idle', window = 100, periods = 2,
                  ['function'] = 'avg', relational_operator = '<=', threshold = 15 },
                { metric = 'cpu_wait', window = 60, periods = 0,
                  ['function'] = 'avg', relational_operator = '>=', threshold = 25 },
            },
        },
    },
    { -- 5
        name = 'CPU_Critical_Controller_AND',
        description = 'CPU is critical for controller',
        severity = 'critical',
        enabled = true,
        trigger = {
            logical_operator = 'and',
            rules = {
                { metric = 'cpu_idle', window = 120, periods = 2,
                  ['function'] = 'avg', relational_operator = '<=', threshold = 3 },
                { metric = 'cpu_wait', window = 60, periods = 1,
                  ['function'] = 'avg', relational_operator = '>=', threshold = 30 },
            },
        },
    },
    { -- 6
        name = 'FS_root',
        description = 'FS root',
        severity = 'critical',
        enabled = true,
        trigger = {
            logical_operator = 'and',
            rules = {
                { metric = 'fs_space_percent_free', fields = { fs = '/' }, window = 120,
                  ['function'] = 'avg', relational_operator = '<=', threshold = 10 },
            },
        },
    },
    { -- 7
        name = 'Backend_errors_5xx',
        description = 'Errors 5xx on backends',
        severity = 'warning',
        enabled = true,
        trigger = {
            logical_operator = 'or',
            rules = {
                { metric = 'haproxy_backend_response_5xx', window = 30, periods = 1,
                  ['function'] = 'diff', relational_operator = '>', threshold = 0 },
            },
        },
    },
    { -- 8 (no relational_operator and no logical_operator)
        name = 'nova_logs_errors_rate',
        description = 'Rate of change for nova logs in error is too high',
        severity = 'warning',
        enabled = true,
        trigger = {
            rules = {
                { metric = 'log_messages', window = 60, periods = 4,
                  ['function'] = 'roc', threshold = 1.5 },
            },
        },
    },
    { -- 9 (no logical_operator)
        name = 'heartbeat',
        description = 'No metric!',
        severity = 'down',
        enabled = true,
        trigger = {
            rules = {
                { metric = 'foo_heartbeat', window = 60, periods = 1,
                  ['function'] = 'last', relational_operator = '==', threshold = 0 },
            },
        },
    },
}
215
-- Alarm on a multi-valued metric: the rule selects which member of the
-- datapoint to use through its 'value' key.
afd_on_multivalue = {
    name = 'keystone-high-http-response-times',
    description = 'The 90 percentile response time for Keystone is too high',
    severity = 'warning',
    enabled = true,
    trigger = {
        rules = {
            {
                metric = 'http_response_times',
                fields = { http_method = 'POST' },
                value = 'upper_90',
                window = 60,
                periods = 1,
                ['function'] = 'max',
                relational_operator = '>=',
                threshold = 5,
            },
        },
    },
}
236
-- Same kind of alarm as above but the rule has no 'value' key, i.e. it does
-- not say which member of the multi-valued datapoint to use.
missing_value_afd_on_multivalue = {
    name = 'keystone-high-http-response-times',
    description = 'The 90 percentile response time for Keystone is too high',
    severity = 'warning',
    enabled = true,
    trigger = {
        rules = {
            {
                metric = 'http_response_times',
                fields = { http_method = 'POST' },
                -- value = 'upper_90',   (left out on purpose)
                window = 30,
                periods = 2,
                ['function'] = 'max',
                relational_operator = '>=',
                threshold = 5,
            },
        },
    },
}
257
-- Table holding the test cases defined below (global on purpose).
TestLMAAlarm = {}

-- Simulated clock shared by the tests; advanced by next_time() and reset
-- to 0 by tearDown().
local current_time = 0
261
--- Runs after each test: rewinds the simulated clock and drops the alarms
-- loaded into the module under test.
function TestLMAAlarm:tearDown()
    current_time = 0
    lma_alarm.reset_alarms()
end
266
--- Advance the simulated clock and return its new value.
-- @param inc number of seconds to advance (defaults to 10)
-- @return the updated current_time (inc is scaled by 1e9)
local function next_time(inc)
    local seconds = inc or 10
    current_time = current_time + seconds * 1e9
    return current_time
end
272
--- Checks when is_evaluation_time() turns true for an alarm whose rules use
-- window=120 and periods=2.
function TestLMAAlarm:test_start_evaluation()
    lma_alarm.load_alarm(alarms[3])
    lma_alarm.set_start_time(current_time)
    local alarm = lma_alarm.get_alarm('CPU_Critical_Controller')

    -- { seconds to advance, expected is_evaluation_time() result }
    local steps = {
        { 10, false },  -- t = 10s
        { 50, false },  -- t = 60s
        { 60, false },  -- t = 120s
        { 120, true },  -- t = 240s
        { 240, true },  -- later
    }
    for _, step in ipairs(steps) do
        local advance, expected = step[1], step[2]
        assertEquals(alarm:is_evaluation_time(next_time(advance)), expected)
    end
end
283
--- Ten seconds after the start time no alarm is due yet (none has a window
-- that short), so evaluate() is expected to return a nil state.
function TestLMAAlarm:test_not_the_time()
    lma_alarm.load_alarms(alarms)
    lma_alarm.set_start_time(current_time)
    local now = next_time() -- +10s
    local state = lma_alarm.evaluate(now)
    assertEquals(state, nil)
end
290
--- get_metric_fields() reports the field names used by the rules watching
-- a metric ('fs' comes from the FS_root alarm).
function TestLMAAlarm:test_lookup_fields_for_metric()
    lma_alarm.load_alarms(alarms)
    local expected = { "fs" }
    assertItemsEquals(lma_alarm.get_metric_fields('fs_space_percent_free'), expected)
end
296
--- A metric whose rules declare no fields yields an empty list, while a
-- metric with a field-constrained rule still reports that field.
function TestLMAAlarm:test_lookup_empty_fields_for_metric()
    lma_alarm.load_alarms(alarms)

    local cpu_fields = lma_alarm.get_metric_fields('cpu_idle')
    assertItemsEquals(cpu_fields, {})

    local fs_fields = lma_alarm.get_metric_fields('fs_space_percent_free')
    assertItemsEquals(fs_fields, { 'fs' })
end
304
--- get_interested_alarms() returns the alarms having a rule on the metric:
-- none for an unknown metric, three for cpu_wait (alarms 3, 4 and 5).
function TestLMAAlarm:test_lookup_interested_alarms()
    lma_alarm.load_alarms(alarms)

    local for_unknown = lma_alarm.get_interested_alarms('foometric')
    assertEquals(#for_unknown, 0)

    local for_cpu_wait = lma_alarm.get_interested_alarms('cpu_wait')
    assertEquals(#for_cpu_wait, 3)
end
313
--- get_alarms() exposes one entry per loaded alarm definition.
function TestLMAAlarm:test_get_alarms()
    lma_alarm.load_alarms(alarms)
    local loaded = 0
    -- count with pairs(): the returned table is not assumed to be a sequence
    for _ in pairs(lma_alarm.get_alarms()) do
        loaded = loaded + 1
    end
    assertEquals(loaded, #alarms)
end
323
--- When no datapoint was ever received, evaluation yields the unknown state
-- and every reported alert carries the "never received" message.
function TestLMAAlarm:test_no_datapoint()
    lma_alarm.load_alarms(alarms)
    lma_alarm.set_start_time(current_time)

    -- 300s after start every alarm is due for evaluation
    local state, results = lma_alarm.evaluate(next_time(300))
    assertEquals(state, consts.UNKW)
    assert(#results > 0)
    for i = 1, #results do
        local alert = results[i].alert
        assertEquals(alert.message, 'No datapoint have been received ever')
        assertNotEquals(alert.fields, nil)
    end
end
336
--- An 'and' alarm stays OKAY when none of its rules match.
function TestLMAAlarm:test_rules_logical_op_and_no_alert()
    lma_alarm.load_alarms(alarms)
    local alarm = lma_alarm.get_alarm('CPU_Critical_Controller_AND')
    lma_alarm.set_start_time(current_time)

    -- timestamps at 60s, 120s, 180s and 240s
    local times = {}
    for i = 1, 4 do
        times[i] = next_time(60)
    end

    for i, value in ipairs({ 3, 10, 1, 10 }) do
        lma_alarm.add_value(times[i], 'cpu_wait', value)
    end
    for i, value in ipairs({ 30, 10, 10, 20 }) do
        lma_alarm.add_value(times[i], 'cpu_idle', value)
    end

    local state, result = alarm:evaluate(times[4])
    assertEquals(#result, 0)
    assertEquals(state, consts.OKAY)
end
358
--- For an 'and' alarm, rules whose metric stopped reporting turn the state
-- to unknown (one alert per silent metric), and the state goes back to OKAY
-- once datapoints are received again.
function TestLMAAlarm:test_rules_logical_missing_datapoint__op_and()
    local NO_DATA = 'No datapoint have been received over the last'
    local function assert_no_data(entry, metric)
        assertEquals(entry.alert.metric, metric)
        assert(entry.alert.message:match(NO_DATA))
    end

    lma_alarm.load_alarm(alarms[5])
    lma_alarm.set_start_time(current_time)

    -- datapoints for both metrics at 60s, 120s, 180s and 240s
    local times = {}
    for i = 1, 4 do
        times[i] = next_time(60)
    end
    for i, value in ipairs({ 0, 2, 5, 6 }) do
        lma_alarm.add_value(times[i], 'cpu_wait', value)
    end
    for i = 1, 4 do
        lma_alarm.add_value(times[i], 'cpu_idle', 20)
    end

    -- at 240s the alarm can be evaluated
    local state, result = lma_alarm.evaluate(times[4])
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0)

    -- 60s without datapoint
    state, result = lma_alarm.evaluate(next_time(60))
    assertEquals(state, consts.OKAY)

    -- 61s without datapoint: cpu_wait has no data within its observation period
    state, result = lma_alarm.evaluate(next_time(1))
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 1)
    assert_no_data(result[1], 'cpu_wait')

    -- 241s without datapoint: both metrics are now silent
    state, result = lma_alarm.evaluate(next_time(180))
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 2)
    assert_no_data(result[1], 'cpu_idle')
    assert_no_data(result[2], 'cpu_wait')

    -- datapoints come back for both metrics
    for _, metric in ipairs({ 'cpu_idle', 'cpu_idle', 'cpu_wait', 'cpu_wait' }) do
        lma_alarm.add_value(next_time(), metric, 20)
    end
    state, result = lma_alarm.evaluate(next_time())
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0)
end
404
--- Variant of the missing-datapoint scenario where the two metrics go
-- silent one after the other: first cpu_wait, then (after cpu_wait resumed)
-- cpu_idle.
function TestLMAAlarm:test_rules_logical_missing_datapoint__op_and_2()
    local NO_DATA = 'No datapoint have been received over the last'
    local function assert_no_data(entry, metric)
        assertEquals(entry.alert.metric, metric)
        assert(entry.alert.message:match(NO_DATA))
    end

    lma_alarm.load_alarm(alarms[5])
    lma_alarm.set_start_time(current_time)

    -- datapoints for both metrics at 60s, 120s, 180s and 240s
    local times = {}
    for i = 1, 4 do
        times[i] = next_time(60)
    end
    for i, value in ipairs({ 0, 2, 5, 6 }) do
        lma_alarm.add_value(times[i], 'cpu_wait', value)
    end
    for i = 1, 4 do
        lma_alarm.add_value(times[i], 'cpu_idle', 20)
    end

    -- at 240s the alarm can be evaluated
    local state, result = lma_alarm.evaluate(times[4])
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0)

    -- 60s without datapoint
    state, result = lma_alarm.evaluate(next_time(60))
    assertEquals(state, consts.OKAY)

    -- 61s without datapoint: cpu_wait has no data within its observation period
    state, result = lma_alarm.evaluate(next_time(1))
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 1)
    assert_no_data(result[1], 'cpu_wait')

    -- cpu_wait resumes; cpu_idle has no data within its observation period
    lma_alarm.add_value(next_time(170), 'cpu_wait', 20)
    state, result = lma_alarm.evaluate(next_time())
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 1)
    assert_no_data(result[1], 'cpu_idle')

    -- datapoints come back for both metrics
    for _, metric in ipairs({ 'cpu_idle', 'cpu_idle', 'cpu_wait', 'cpu_wait' }) do
        lma_alarm.add_value(next_time(), metric, 20)
    end
    state, result = lma_alarm.evaluate(next_time())
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0)
end
449
--- Walks the 'and' alarm (avg(cpu_idle)<=3 and avg(cpu_wait)>=30) through
-- four situations: both rules match, only one matches, one metric is
-- silent, and one metric is silent while the other matches.
function TestLMAAlarm:test_rules_logical_op_and()
    -- add one datapoint after advancing the clock (10s when inc is omitted)
    local function feed(metric, value, inc)
        lma_alarm.add_value(next_time(inc), metric, value)
    end

    lma_alarm.load_alarm(alarms[5])
    local alarm = lma_alarm.get_alarm('CPU_Critical_Controller_AND')

    for _, value in ipairs({ 30, 30, 35 }) do
        feed('cpu_wait', value, 1)
    end
    for _, value in ipairs({ 0, 1, 7, 2 }) do
        feed('cpu_idle', value, 2)
    end
    local state, result = alarm:evaluate(current_time)
    assertEquals(state, consts.CRIT)
    assertEquals(#result, 2) -- both rules match: avg(cpu_wait)>=30 and avg(cpu_idle)<=3

    feed('cpu_idle', 70, 120)
    feed('cpu_idle', 70)
    feed('cpu_idle', 70)
    feed('cpu_wait', 40)
    feed('cpu_wait', 38)
    state, result = alarm:evaluate(current_time)
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0) -- avg(cpu_wait)>=30 matches but not avg(cpu_idle)<=3

    feed('cpu_idle', 70, 200)
    feed('cpu_idle', 70)
    state, result = alarm:evaluate(current_time)
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 1) -- no data for cpu_wait; avg(cpu_idle)<=3 doesn't match

    next_time(240) -- spend enough time to invalidate datapoints of cpu_wait
    lma_alarm.add_value(current_time, 'cpu_idle', 2)
    feed('cpu_idle', 2)
    state, result = alarm:evaluate(current_time)
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 2) -- no data for cpu_wait; avg(cpu_idle)<=3 matches
end
487
--- An 'or' alarm fires as soon as one rule matches and reports only the
-- matching rule.
function TestLMAAlarm:test_rules_logical_op_or_one_alert()
    lma_alarm.load_alarms(alarms)
    local cpu_warning = lma_alarm.get_alarm('CPU_Warning_Controller')

    for _, value in ipairs({ 15, 10, 20 }) do
        lma_alarm.add_value(next_time(), 'cpu_wait', value)
    end
    for _, value in ipairs({ 11, 8, 7 }) do
        lma_alarm.add_value(next_time(), 'cpu_idle', value)
    end

    local state, result = cpu_warning:evaluate(current_time)
    assertEquals(state, consts.WARN)
    -- avg(cpu_wait) is NOT >=25 but avg(cpu_idle) is <=15
    assertEquals(#result, 1)
end
502
--- An 'or' alarm whose two rules match reports both of them.
function TestLMAAlarm:test_rules_logical_op_or_all_alert()
    lma_alarm.load_alarm(alarms[4])
    local cpu_warning = lma_alarm.get_alarm('CPU_Warning_Controller')

    for _, value in ipairs({ 35, 20, 32 }) do
        lma_alarm.add_value(next_time(), 'cpu_wait', value)
    end
    for _, value in ipairs({ 3, 2.5, 1.5 }) do
        lma_alarm.add_value(next_time(), 'cpu_idle', value)
    end

    local state, result = cpu_warning:evaluate(current_time)
    assertEquals(state, consts.WARN)
    -- avg(cpu_wait) >=25 and avg(cpu_idle) <=15
    assertEquals(#result, 2)
end
517
--- The 'min' function: the smallest datapoint (50) reaches the threshold of
-- the RabbitMQ_Critical alarm, which therefore fires.
function TestLMAAlarm:test_min()
    lma_alarm.load_alarms(alarms)
    for _, messages in ipairs({ 50, 100, 75, 81 }) do
        lma_alarm.add_value(next_time(), 'rabbitmq_messages', messages)
    end

    local alarm = lma_alarm.get_alarm('RabbitMQ_Critical')
    assertEquals(alarm.severity, consts.CRIT)

    local state, result = alarm:evaluate(current_time)
    assertEquals(state, consts.CRIT) -- min()>=50
    assertEquals(#result, 1)
    assertEquals(result[1].value, 50)
end
531
--- The 'max' function: the rule declares no fields and the reported value
-- is the largest datapoint received, whatever its 'queue' field.
function TestLMAAlarm:test_max()
    lma_alarm.load_alarm({
        name = 'foo alert',
        description = 'foo description',
        severity = 'warning',
        trigger = {
            rules = {
                {
                    metric = 'rabbitmq_queue_messages',
                    window = 30,
                    periods = 2,
                    ['function'] = 'max',
                    relational_operator = '>=',
                    threshold = 200,
                },
            },
        },
    })

    -- { value, queue }
    local samples = {
        { 0, 'queue-XX' },
        { 260, 'queue-XX' },
        { 200, 'queue-XX' },
        { 152, 'queue-XX' },
        { 152, 'nova' },
        { 532, 'nova' },
    }
    for _, sample in ipairs(samples) do
        lma_alarm.add_value(next_time(), 'rabbitmq_queue_messages', sample[1],
                            { queue = sample[2], hostname = 'node-x' })
    end

    local state, result = lma_alarm.evaluate(current_time)
    assertEquals(state, consts.WARN)
    assertEquals(#result, 1)
    assertEquals(result[1].alert['function'], 'max')
    assertEquals(result[1].alert.value, 532)
end
563
--- The 'diff' function on a counter: warns while the counter grows, is OKAY
-- when it is flat and unknown when datapoints are missing.
function TestLMAAlarm:test_diff()
    local METRIC = 'haproxy_backend_response_5xx'
    -- one datapoint every 10s
    local function feed(values)
        for _, value in ipairs(values) do
            lma_alarm.add_value(next_time(), METRIC, value)
        end
    end

    lma_alarm.load_alarms(alarms)
    local alarm = lma_alarm.get_alarm('Backend_errors_5xx')
    assertEquals(alarm.severity, consts.WARN)

    -- with 5xx errors
    feed({ 1, 11, 21 })
    local state, result = alarm:evaluate(current_time)
    assertEquals(state, consts.WARN)
    assertEquals(#result, 1)
    assertEquals(result[1].value, 20)

    -- without 5xx errors
    feed({ 21, 21, 21 })
    state, result = alarm:evaluate(current_time)
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0)

    -- missing data
    state, result = alarm:evaluate(next_time(60))
    assertEquals(state, consts.UNKW)
end
590
--- The 'roc' (rate of change) function on the nova_logs_errors_rate alarm
-- (window=60, periods=4, threshold=1.5).
-- Each scenario feeds 72 datapoints, one every 5 seconds, i.e. six groups
-- of twelve values labelled below as four historical windows, the previous
-- window and the current window.
function TestLMAAlarm:test_roc()
    -- Feed the values in sequence order. ipairs() is required here: the
    -- datapoints are timestamped as they are iterated and pairs() gives no
    -- ordering guarantee.
    local function feed(values)
        for _, value in ipairs(values) do
            lma_alarm.add_value(next_time(5), 'log_messages', value,
                                {service = 'nova', level = 'error'})
        end
    end

    lma_alarm.load_alarms(alarms)
    local errors_logs = lma_alarm.get_alarm('nova_logs_errors_rate')
    assertEquals(errors_logs.severity, consts.WARN)

    -- Test one error in the current window
    feed({ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- historical window 1
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- historical window 2
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- historical window 3
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- historical window 4
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- previous window
           0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 }) -- current window
    local state = errors_logs:evaluate(current_time)
    assertEquals(state, consts.WARN)

    -- Test one error in the historical window
    feed({ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- historical window 1
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- historical window 2
           0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,   -- historical window 3
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- historical window 4
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   -- previous window
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }) -- current window
    state = errors_logs:evaluate(current_time)
    assertEquals(state, consts.OKAY)

    -- with rate errors
    feed({ 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- historical window 1
           1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- historical window 2
           1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- historical window 3
           1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- historical window 4
           1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- previous window
           1, 2, 1, 1, 1, 2, 1, 5, 5, 7, 1, 7 }) -- current window
    state = errors_logs:evaluate(current_time)
    assertEquals(state, consts.WARN)

    -- without rate errors
    feed({ 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- historical window 1
           1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- historical window 2
           1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- historical window 3
           1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- historical window 4
           1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,   -- previous window
           1, 2, 1, 1, 1, 2, 1, 3, 4, 3, 3, 4 }) -- current window
    state = errors_logs:evaluate(current_time)
    assertEquals(state, consts.OKAY)
end
649
--- With a critical and a warning alarm loaded on the same metrics, the
-- global state follows whichever alarm is raised by the latest datapoints.
function TestLMAAlarm:test_alarm_first_match()
    lma_alarm.load_alarm(alarms[3]) -- CPU_Critical_Controller
    lma_alarm.load_alarm(alarms[4]) -- CPU_Warning_Controller
    lma_alarm.set_start_time(current_time)

    -- wait long enough for both alarms to be evaluated
    next_time(240)
    lma_alarm.add_value(next_time(), 'cpu_idle', 15)
    lma_alarm.add_value(next_time(), 'cpu_wait', 9)
    local state, result = lma_alarm.evaluate(next_time())
    assertEquals(state, consts.WARN) -- warning alarm raised
    assertEquals(#result, 1) -- cpu_idle matches (<= 15), cpu_wait doesn't (>= 25)

    -- again with new datapoints
    next_time(240)
    lma_alarm.add_value(next_time(), 'cpu_wait', 15)
    lma_alarm.add_value(next_time(), 'cpu_idle', 4)
    state, result = lma_alarm.evaluate(next_time())
    assertEquals(state, consts.CRIT) -- critical alarm raised
    assertEquals(#result, 1) -- cpu_idle matches (<= 5), cpu_wait doesn't (>= 20)
end
669
--- A rule constrained by fields only averages the matching datapoints
-- (fs='/': 6, 12, 6), while a rule without fields averages all of them.
-- Both averages happen to be 8.
function TestLMAAlarm:test_rules_fields()
    local METRIC = 'fs_space_percent_free'

    lma_alarm.load_alarm(alarms[1]) -- FS_all_no_field
    lma_alarm.load_alarm(alarms[6]) -- FS_root
    lma_alarm.set_start_time(current_time)

    local first = next_time()
    lma_alarm.add_value(first, METRIC, 6, {fs = '/'})
    lma_alarm.add_value(first, METRIC, 6)
    lma_alarm.add_value(next_time(), METRIC, 12, {fs = '/'})
    lma_alarm.add_value(next_time(), METRIC, 17)
    lma_alarm.add_value(next_time(), METRIC, 6, {fs = '/'})
    lma_alarm.add_value(next_time(), METRIC, 6, {fs = 'foo'})
    lma_alarm.add_value(next_time(), METRIC, 3, {fs = 'foo'})
    local eval_time = next_time()

    local root_fs = lma_alarm.get_alarm('FS_root')
    local _, root_result = root_fs:evaluate(eval_time)
    assertEquals(#root_result, 1)
    assertItemsEquals(root_result[1].fields, {fs='/'})
    assertEquals(root_result[1].value, 8)

    local all_fs = lma_alarm.get_alarm('FS_all_no_field')
    local _, all_result = all_fs:evaluate(eval_time)
    assertEquals(#all_result, 1)
    assertItemsEquals(all_result[1].fields, {})
    assertEquals(all_result[1].value, 8)
end
699
--- The 'last' function only looks at the most recent datapoint: the
-- heartbeat alarm is DOWN when it is 0, OKAY when it is 1 and unknown when
-- no datapoint is recent enough.
function TestLMAAlarm:test_last_fct()
    local function beat(value)
        lma_alarm.add_value(next_time(), 'foo_heartbeat', value)
    end
    local function current_state()
        local state = lma_alarm.evaluate(next_time())
        return state
    end

    lma_alarm.load_alarm(alarms[9])
    lma_alarm.set_start_time(current_time)

    for _, value in ipairs({ 1, 1, 0, 1, 0 }) do
        beat(value)
    end
    assertEquals(current_state(), consts.DOWN)

    next_time(61) -- let the datapoints fall out of the window
    assertEquals(current_state(), consts.UNKW)

    beat(0)
    assertEquals(current_state(), consts.DOWN)

    beat(1)
    assertEquals(current_state(), consts.OKAY)
end
721
--- A rule with value='upper_90' is evaluated against that member of the
-- multi-valued datapoints; the maximum (6) reaches the threshold.
function TestLMAAlarm:test_rule_with_multivalue()
    lma_alarm.load_alarm(afd_on_multivalue)
    lma_alarm.set_start_time(current_time)

    for _, upper_90 in ipairs({ 0.4, 0.2, 6, 3, 4 }) do
        lma_alarm.add_value(next_time(), 'http_response_times',
                            {upper_90 = upper_90, foo = 1}, {http_method = 'POST'})
    end

    local state, result = lma_alarm.evaluate(next_time()) -- 60s after start
    assertEquals(state, consts.WARN)
    local alert = result[1].alert
    assertItemsEquals(alert.fields, {http_method='POST'})
    assertEquals(alert.value, 6)
end
736
--- Feeding multi-valued datapoints to a rule that has no 'value' key must
-- not crash; the evaluation is expected to end in the unknown state.
function TestLMAAlarm:test_nocrash_missing_value_with_multivalue_metric()
    lma_alarm.load_alarm(missing_value_afd_on_multivalue)
    lma_alarm.set_start_time(current_time)

    for _, upper_90 in ipairs({ 0.4, 0.2, 6, 3, 4 }) do
        lma_alarm.add_value(next_time(), 'http_response_times',
                            {upper_90 = upper_90, foo = 1}, {http_method = 'POST'})
    end

    local state = lma_alarm.evaluate(next_time()) -- 60s after start
    assertEquals(state, consts.UNKW)
end
749
--- Field expressions with alternatives ('POST || GET', '2xx || ==3xx'):
-- the expected maximum is 6, so the 999 sample tagged http_status='5xx'
-- must not be taken into account.
function TestLMAAlarm:test_complex_field_matching_alarm_trigger()
    lma_alarm.load_alarm({
        name = 'keystone-high-http-response-times',
        description = 'The 90 percentile response time for Keystone is too high',
        severity = 'warning',
        enabled = true,
        trigger = {
            rules = {
                {
                    metric = 'http_response_times',
                    fields = { http_method = 'POST || GET',
                               http_status = '2xx || ==3xx'},
                    value = 'upper_90',
                    window = 30,
                    periods = 2,
                    ['function'] = 'max',
                    relational_operator = '>=',
                    threshold = 5,
                },
            },
        },
    })
    lma_alarm.set_start_time(current_time)

    -- { upper_90, http_method, http_status }
    local samples = {
        { 0.4, 'POST', '2xx' },
        { 0.2, 'POST', '2xx' },
        { 6,   'POST', '3xx' },
        { 999, 'POST', '5xx' },
        { 3,   'GET',  '2xx' },
        { 4,   'POST', '2xx' },
    }
    for _, sample in ipairs(samples) do
        lma_alarm.add_value(next_time(), 'http_response_times',
                            {upper_90 = sample[1], foo = 1},
                            {http_method = sample[2], http_status = sample[3]})
    end

    local state, result = lma_alarm.evaluate(next_time())
    assertEquals(state, consts.WARN)
    local alert = result[1].alert
    assertEquals(alert.value, 6) -- the max
    assertItemsEquals(alert.fields, {http_method='POST || GET', http_status='2xx || ==3xx'})
end
786
--- Same field expressions with the 'avg' function: the average of the
-- samples stays below the threshold, so the state is OKAY.
function TestLMAAlarm:test_complex_field_matching_alarm_ok()
    lma_alarm.load_alarm({
        name = 'keystone-high-http-response-times',
        description = 'The 90 percentile response time for Keystone is too high',
        severity = 'warning',
        enabled = true,
        trigger = {
            rules = {
                {
                    metric = 'http_response_times',
                    fields = { http_method = 'POST || GET',
                               http_status = '2xx || 3xx'},
                    value = 'upper_90',
                    window = 30,
                    periods = 2,
                    ['function'] = 'avg',
                    relational_operator = '>=',
                    threshold = 5,
                },
            },
        },
    })
    lma_alarm.set_start_time(current_time)

    -- { upper_90, http_method }, all with http_status = '2xx'
    local samples = {
        { 0.4, 'POST' },
        { 0.2, 'POST' },
        { 6,   'POST' },
        { 3,   'GET' },
        { 4,   'POST' },
    }
    for _, sample in ipairs(samples) do
        lma_alarm.add_value(next_time(), 'http_response_times',
                            {upper_90 = sample[1], foo = 1},
                            {http_method = sample[2], http_status = '2xx'})
    end

    local state = lma_alarm.evaluate(next_time())
    assertEquals(state, consts.OKAY)
end
821
--- The fields reported for a metric include both the rule's 'fields' keys
-- and its 'group_by' entries; an unknown metric has none.
function TestLMAAlarm:test_group_by_required_field()
    lma_alarm.load_alarm({
        name = 'foo-alarm',
        description = 'foo description',
        severity = 'warning',
        enabled = true,
        trigger = {
            rules = {
                {
                    metric = 'foo_metric_name',
                    fields = { foo = 'bar', bar = 'foo' },
                    group_by = {'fs'},
                    window = 30,
                    periods = 1,
                    ['function'] = 'avg',
                    relational_operator = '<=',
                    threshold = 5,
                },
            },
        },
    })

    local known = lma_alarm.get_metric_fields('foo_metric_name')
    assertItemsEquals(known, { "fs", "foo", "bar" })

    local unknown = lma_alarm.get_metric_fields('non_existant_metric')
    assertItemsEquals(unknown, {})
end
850
--- Grouping by 'fs': two results are expected in the first round (osd-1 and
-- osd-2 average at or below the threshold, osd-3 does not), none once every
-- filesystem reports 50% free.
function TestLMAAlarm:test_group_by_one_field()
    local function add(timestamp, value, fs)
        lma_alarm.add_value(timestamp, 'fs_space_percent_free', value, {fs = fs})
    end

    lma_alarm.load_alarm({
        name = 'osd-filesystem-warning',
        description = 'free space is too low',
        severity = 'warning',
        enabled = true,
        trigger = {
            rules = {
                {
                    metric = 'fs_space_percent_free',
                    fields = { fs = '=~ osd%-%d && !~ /var/log' },
                    group_by = {'fs'},
                    window = 30,
                    periods = 1,
                    ['function'] = 'avg',
                    relational_operator = '<=',
                    threshold = 5,
                },
            },
        },
    })
    lma_alarm.set_start_time(current_time)

    -- three rounds of datapoints, each round sharing one timestamp
    add(next_time(), 5, 'osd-1')
    add(current_time, 4, 'osd-2')
    add(current_time, 80, 'osd-3')
    add(next_time(), 4, 'osd-1')
    add(current_time, 3, 'osd-2')
    add(current_time, 80, 'osd-3')
    add(next_time(), 4, 'osd-1')
    add(current_time, 2, 'osd-2')
    add(current_time, 80, 'osd-3')
    -- presumably filtered out by the '!~ /var/log' part of the expression
    add(current_time, 1, '/var/log/osd-3')

    local state, result = lma_alarm.evaluate(next_time())
    assertEquals(#result, 2)
    assertEquals(state, consts.WARN)

    next_time(100) -- spend enough time to invalidate datapoints
    add(next_time(), 50, 'osd-1')
    add(current_time, 50, 'osd-2')
    add(current_time, 50, 'osd-3')
    add(next_time(), 50, 'osd-1')
    add(current_time, 50, 'osd-2')
    add(current_time, 50, 'osd-3')
    state, result = lma_alarm.evaluate(next_time())
    assertEquals(#result, 0)
    assertEquals(state, consts.OKAY)
end
901
-- Exercises a rule grouped by two fields ('fs' and 'osd') using the 'last'
-- function: the test expects one result item per (fs, osd) pair whose value
-- is at or below the threshold.
function TestLMAAlarm:test_group_by_several_fields()
    local metric = 'fs_space_percent_free'

    -- Record one datapoint of the metric for the given (fs, osd) pair.
    local function push(ts, value, fs, osd)
        lma_alarm.add_value(ts, metric, value, {fs = fs, osd = osd})
    end

    lma_alarm.load_alarm({
        name = 'osd-filesystem-warning',
        description = 'free space is too low',
        enabled = true,
        trigger = {
            rules = {
                {
                    metric = metric,
                    window = 30,
                    periods = 1,
                    ['function'] = 'last',
                    fields = {},
                    group_by = {'fs', 'osd'},
                    relational_operator = '<=',
                    threshold = 5,
                },
            },
        },
        severity = 'warning',
    })
    lma_alarm.set_start_time(current_time)

    -- First round: all three datapoints share the timestamp produced by the
    -- single next_time() call.
    push(next_time(), 5, '/foo', '1')
    push(current_time, 4, '/foo', '2')
    push(current_time, 80, '/foo', '3')

    local first_state, first_items = lma_alarm.evaluate(next_time(20))
    assertEquals(first_state, consts.WARN)
    -- one item for {fs = '/foo', osd = '1'} and another one for {fs = '/foo', osd = '2'}
    assertEquals(#first_items, 2)

    next_time(100) -- spend enough time to invalidate datapoints

    -- Second round: same '/foo' values plus a '/bar' filesystem.
    push(next_time(), 5, '/foo', '1')
    push(current_time, 4, '/foo', '2')
    push(current_time, 80, '/foo', '3')
    push(current_time, 15, '/bar', '1')
    push(current_time, 14, '/bar', '2')
    push(current_time, 2, '/bar', '3')

    local second_state, second_items = lma_alarm.evaluate(next_time(20))
    assertEquals(second_state, consts.WARN)
    -- one item for {fs = '/foo', osd = '1'}, another one for {fs = '/foo', osd = '2'}
    -- and another one for {fs = '/bar', osd = '3'}
    assertEquals(#second_items, 3)
end
949
-- Feeds datapoints that carry no fields at all to a rule that filters and
-- groups on 'fs'; the test expects the UNKW state with a single result item.
function TestLMAAlarm:test_group_by_missing_field_is_unknown()
    local rule = {
        metric = 'fs_space_percent_free',
        window = 30,
        periods = 1,
        ['function'] = 'avg',
        fields = { fs = '=~ osd%-%d && !~ /var/log' },
        group_by = {'fs'},
        relational_operator = '<=',
        threshold = 5,
    }
    lma_alarm.load_alarm({
        name = 'osd-filesystem-warning',
        description = 'free space is too low',
        enabled = true,
        trigger = {
            rules = { rule },
        },
        severity = 'warning',
    })
    lma_alarm.set_start_time(current_time)

    -- Three successive datapoints, each at a new timestamp, passed without
    -- the optional fields argument.
    for _, free_percent in ipairs({5, 4, 4}) do
        lma_alarm.add_value(next_time(), 'fs_space_percent_free', free_percent)
    end

    local status, items = lma_alarm.evaluate(next_time())
    assertEquals(#items, 1)
    assertEquals(status, consts.UNKW)
end
982
-- With no_data_policy = 'okay', the test expects an alarm whose metric never
-- receives a datapoint to evaluate to OKAY with an empty result list.
function TestLMAAlarm:test_no_data_policy_okay()
    local rule = {
        metric = 'foo_metric_name',
        window = 30,
        periods = 1,
        ['function'] = 'avg',
        fields = { foo = 'bar', bar = 'foo' },
        group_by = {'fs'},
        relational_operator = '<=',
        threshold = 5,
    }
    lma_alarm.load_alarm({
        name = 'foo-alarm',
        description = 'foo description',
        enabled = true,
        trigger = {
            rules = { rule },
        },
        severity = 'warning',
        no_data_policy = 'okay',
    })
    lma_alarm.set_start_time(current_time)

    -- Only a metric unrelated to the rule is fed.
    lma_alarm.add_value(next_time(100), 'another_metric', 5)

    local status, items = lma_alarm.evaluate(next_time())
    assertEquals(#items, 0)
    assertEquals(status, consts.OKAY)
end
1014
-- With no_data_policy = 'critical', the test expects an alarm whose metric
-- never receives a datapoint to evaluate to CRIT with one result item.
function TestLMAAlarm:test_no_data_policy_critical()
    local rule = {
        metric = 'foo_metric_name',
        window = 30,
        periods = 1,
        ['function'] = 'avg',
        fields = { foo = 'bar', bar = 'foo' },
        group_by = {'fs'},
        relational_operator = '<=',
        threshold = 5,
    }
    lma_alarm.load_alarm({
        name = 'foo-alarm',
        description = 'foo description',
        enabled = true,
        trigger = {
            rules = { rule },
        },
        severity = 'critical',
        no_data_policy = 'critical',
    })
    lma_alarm.set_start_time(current_time)

    -- Only a metric unrelated to the rule is fed.
    lma_alarm.add_value(next_time(100), 'another_metric', 5)

    local status, items = lma_alarm.evaluate(next_time())
    assertEquals(#items, 1)
    assertEquals(status, consts.CRIT)
end
1046
-- With no_data_policy = 'skip', the test expects evaluate() to return a nil
-- state for an alarm whose metric never receives a datapoint.
function TestLMAAlarm:test_no_data_policy_skip()
    local rule = {
        metric = 'foo_metric_name',
        window = 30,
        periods = 1,
        ['function'] = 'avg',
        fields = { foo = 'bar', bar = 'foo' },
        group_by = {'fs'},
        relational_operator = '<=',
        threshold = 5,
    }
    lma_alarm.load_alarm({
        name = 'foo-alarm',
        description = 'foo description',
        enabled = true,
        trigger = {
            rules = { rule },
        },
        severity = 'critical',
        no_data_policy = 'skip',
    })
    lma_alarm.set_start_time(current_time)

    -- Only a metric unrelated to the rule is fed.
    lma_alarm.add_value(next_time(100), 'another_metric', 5)

    -- The second return value is not inspected by this test.
    local status, _ = lma_alarm.evaluate(next_time())
    assertEquals(status, nil)
end
1077
-- Run the test suite and pass the value returned by LuaUnit's run() to
-- os.exit(). The runner handle is kept local so that it does not leak into
-- the global environment.
local lu = LuaUnit
lu:setVerbosity( 1 )
os.exit( lu:run() )