import subprocess
import socket
import salt.utils
import logging
import os
import re
import json

__author__ = "Dzmitry Stremkouski"
__copyright__ = "Copyright 2019, Mirantis Inc."
__license__ = "Apache 2.0"

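# Salt execution module with basic infrastructure health checks. The check
# functions are meant to run on the Salt master node and fan out over
# saltutil.cmd; the module name "health_checks" is inferred from the
# fun='health_checks.*' references below. Assumed typical invocation:
#   salt-call health_checks.minions_check
#   salt-call health_checks.galera_check cluster_size=3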
logger = logging.getLogger(__name__)
stream = logging.StreamHandler()
logger.addHandler(stream)


def _failed_minions(out, agent, failed_minions):

    ''' Verify failed minions '''

    if len(failed_minions) > 0:
        logger.error("%s check FAILED" % agent)
        logger.error("Some minions returned non-zero exit code or empty data")
        logger.error("Failed minions:" + str(failed_minions))
        for minion in failed_minions:
            logger.error(minion)
            logger.debug(str(out[minion]['ret']))
        __context__['retcode'] = 2
        return False

    return True


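# Validate raw saltutil.cmd output: bail out if the master returned nothing,
# cross-check the job via the jobs.print_job runner to catch minions that
# never answered (unless ignore_dead), and collect minions that returned a
# non-zero retcode or empty/boolean data.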
def _minions_output(out, agent, ignore_dead, ignore_empty=False):

    ''' Verify minions output and exit code '''

    if not out:
        logger.error("%s check FAILED" % agent)
        logger.error("No response from master cmd")
        __context__['retcode'] = 2
        return False

    if not ignore_dead:
        jid = out.itervalues().next()['jid']
        job_stats = __salt__['saltutil.runner']( 'jobs.print_job', arg=[jid] ) or None
        if not job_stats:
            logger.error("%s check FAILED" % agent)
            logger.error("No response from master runner")
            __context__['retcode'] = 2
            return False

        job_result = job_stats[jid]['Result']
        job_minions = job_stats[jid]['Minions']
        if len(job_minions) != len(job_result):
            logger.error("%s check FAILED" % agent)
            logger.error("Some minions are offline")
            logger.error(list(set(job_minions) - set(job_result.keys())))
            __context__['retcode'] = 2
            return False

    failed_minions = []
    for minion in out:
        if 'retcode' in out[minion]:
            if out[minion]['retcode'] == 0:
                if not ignore_empty:
                    if isinstance(out[minion]['ret'], bool):
                        if minion not in failed_minions:
                            failed_minions.append(minion)
                    elif len(out[minion]['ret']) == 0:
                        if minion not in failed_minions:
                            failed_minions.append(minion)
            else:
                if minion not in failed_minions:
                    failed_minions.append(minion)
        else:
            if minion not in failed_minions:
                failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    return True


def minions_check(wait_timeout=1, gather_job_wait_timeout=1, target='*', target_type='glob', ignore_dead=False):

    ''' Verify minions are online '''

    agent = "Minions"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='test.ping',
                                    timeout=wait_timeout,
                                    gather_job_timeout=gather_job_wait_timeout
                                  ) or None

    return _minions_output(out, agent, ignore_dead, ignore_empty=True)


def time_diff_check(time_diff=1, target='*', target_type='glob', ignore_dead=False, **kwargs):

    ''' Verify time diff on servers '''

    agent = "Time diff"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='status.time',
                                    arg=['%s'],
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    minions_times = {}
    env_times = []
    verified_minions = []

    for minion in out:
        verified_minions.append(minion)
        if out[minion]['retcode'] == 0:
            minion_time = int(out[minion]['ret'])
            if str(minion_time) not in minions_times:
                minions_times[str(minion_time)] = []
            minions_times[str(minion_time)].append(minion)
            env_times.append(minion_time)

    env_times.sort()
    diff = env_times[-1] - env_times[0]

    if diff > time_diff:
        __context__['retcode'] = 2
        if kwargs.get("debug", False):
            return False, minions_times
        else:
            return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True


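# Every line of `contrail-status` output is expected to match the pattern
# below: section headers ("== ... =="), blank lines, or a service name
# followed by active/backup/"inactive (disabled on boot)". Any other state
# (e.g. initializing or failed) marks the minion as failed.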
def contrail_check(target='I@contrail:control or I@contrail:collector or I@opencontrail:compute or I@opencontrail:client', target_type='compound', ignore_dead=False, **kwargs):

    ''' Verify contrail status returns nothing critical '''

    agent = "Contrail status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='cmd.run',
                                    arg=['contrail-status'],
                                    timeout=5
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    pattern = r'^(==|$|\S+\s+(active|backup|inactive\s\(disabled\son\sboot\)))'
    prog = re.compile(pattern)

    validated = []
    for minion in out:
        for line in out[minion]['ret'].split('\n'):
            if not prog.match(line) and minion not in failed_minions:
                failed_minions.append(minion)
        validated.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(validated)
    return True


def galera_check(cluster_size=3, target='I@galera:master or I@galera:slave', target_type='compound', ignore_dead=False, **kwargs):

    ''' Verify galera cluster size and state '''

    agent = "Galera status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='mysql.status',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []

    validated = []
    for minion in out:
        if int(out[minion]['ret']['wsrep_cluster_size']) != int(cluster_size) and minion not in failed_minions:
            failed_minions.append(minion)
        if out[minion]['ret']['wsrep_evs_state'] != 'OPERATIONAL' and minion not in failed_minions:
            failed_minions.append(minion)
        validated.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(validated)
        logger.info("Cluster size: " + str(out[validated[0]]['ret']['wsrep_cluster_size']))
        logger.info("Cluster state: " + str(out[validated[0]]['ret']['wsrep_evs_state']))
    return True


def _quote_str(s, l=False, r=False):

    ''' Quoting rabbitmq erl objects for json import '''

    if len(s) > 0:
        if l:
            s = s.lstrip()
        if r:
            s = s.rstrip()
        if (s[0] == "'") and (s[-1] != "'") and r and not l:
            s += "'"
        if (s[0] == '"') and (s[-1] != '"') and r and not l:
            s += '"'
        if (s[-1] == "'") and (s[0] != "'") and l and not r:
            s = "'" + s
        if (s[-1] == '"') and (s[0] != '"') and l and not r:
            s = '"' + s
        if (s[-1] != "'") and (s[-1] != '"') and (s[0] != "'") and (s[0] != '"'):
            s = '"' + s.replace('"', '\\\"') + '"'
        else:
            if (not l) and (not r) and s[0] != '"' and not s[-1] != '"':
                s = s.replace('"', '\\\"')
        return s.replace("'", '"')
    else:
        return s


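# rabbitmqctl output in the versions this module targets is an Erlang term
# dump rather than JSON. _quote_str() above and _sanitize_rmqctl_output()
# below rewrite it into a json.loads()-friendly string: the text is split on
# commas, bare atoms get double-quoted, a '{' turns the following separator
# into ':', and quotes nested inside values are re-escaped.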
def _sanitize_rmqctl_output(string):

    ''' Sanitizing rabbitmq erl objects for json import '''

    rabbitctl_json = ""
    for line in string.split(','):
        copy = line
        left = ""
        right = ""
        mid = copy
        lpar = False
        rpar = False
        if re.search(r'([\[\{\s]+)(.*)', copy):
            mid = re.sub(r'^([\[\{\s]+)', '', copy)
            left = copy[:-len(mid)]
            copy = mid
            lpar = True
        if re.search(r'(.*)([\]\}\s]+)$', copy):
            mid = re.sub(r'([\]\}\s]+)$', '', copy)
            right = copy[len(mid):]
            copy = mid
            rpar = True
        result = left + _quote_str(mid, l=lpar, r=rpar) + right
        if (not rpar) and lpar and (len(left.strip()) > 0) and (left.strip()[-1] == '{'):
            result += ":"
        else:
            result += ","
        rabbitctl_json += result

    rabbitctl_json = rabbitctl_json[:-1]
    new_rabbitctl_json = rabbitctl_json
    for s in re.findall(r'"[^:\[{\]}]+"\s*:\s*("[^\[{\]}]+")', rabbitctl_json):
        if '"' in s[1:][:-1]:
            orig = s
            changed = '"' + s.replace('\\', '\\\\').replace('"', '\\\"') + '"'
            new_rabbitctl_json = new_rabbitctl_json.replace(orig, changed)
    return new_rabbitctl_json


def rabbitmq_cmd(cmd):

    ''' JSON formatted RabbitMQ command output '''

    supported_commands = ['status', 'cluster_status', 'list_hashes', 'list_ciphers']
    if cmd not in supported_commands:
        logger.error("Command is not supported yet, sorry")
        logger.error("Supported commands are: " + str(supported_commands))
        __context__['retcode'] = 2
        return False

    proc = subprocess.Popen(['rabbitmqctl', cmd], stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    rabbitmqctl_cutoff = stdout[int(stdout.find('[')):int(stdout.rfind(']'))+1].replace('\n','')
    return json.loads(_sanitize_rmqctl_output(rabbitmqctl_cutoff))


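# Each minion returns the parsed `rabbitmqctl cluster_status` structure. A
# minion is considered healthy when the set of running nodes matches the
# configured disc nodes and none of the running nodes report resource alarms.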
def rabbitmq_check(target='I@rabbitmq:server', target_type='compound', ignore_dead=False, **kwargs):

    ''' Verify rabbit cluster and its alarms '''

    agent = "RabbitMQ status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.rabbitmq_cmd',
                                    arg=['cluster_status'],
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []

    for minion in out:
        rabbitmqctl_json = out[minion]['ret']
        running_nodes = []
        available_nodes = []
        alarms = []
        for el in rabbitmqctl_json:
            if 'alarms' in el:
                alarms = el['alarms']
            if 'nodes' in el:
                available_nodes = el['nodes'][0]['disc']
            if 'running_nodes' in el:
                running_nodes = el['running_nodes']

        if sorted(running_nodes) == sorted(available_nodes):
            nodes_alarms = []
            for node in running_nodes:
                for el in alarms:
                    if node in el:
                        if len(el[node]) > 0:
                            nodes_alarms.append(el[node])
            if len(nodes_alarms) > 0:
                failed_minions.append(minion)
        else:
            failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(running_nodes)
    return True


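# `show stat` on the haproxy admin socket returns CSV: the header line starts
# with "# " and lists the field names, and every following line is one entry
# keyed by pxname (proxy) and svname (FRONTEND, BACKEND or an individual
# server). Individual servers are grouped under an artificial 'UPSTREAM' key
# so callers can iterate over backend members easily.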
def haproxy_status(socket_path='/run/haproxy/admin.sock', buff_size=8192, encoding='UTF-8', stats_filter=[]):

    ''' JSON formatted haproxy status '''

    stat_cmd = 'show stat\n'

    if not os.path.exists(socket_path):
        logger.error('Socket %s does not exist or haproxy not running' % socket_path)
        __context__['retcode'] = 2
        return False

    client = socket.socket( socket.AF_UNIX, socket.SOCK_STREAM)
    client.connect(socket_path)

    client.send(bytearray(stat_cmd, encoding))
    output = client.recv(buff_size)

    res = ""
    while output:
        res += output.decode(encoding)
        output = client.recv(buff_size)
    client.close()

    haproxy_stats = {}
    res_list = res.split('\n')
    fields = res_list[0][2:].split(',')
    stats_list = []
    for line in res_list[1:]:
        if len(line.strip()) > 0:
            stats_list.append(line)

    for i in range(len(stats_list)):
        element = {}
        for n in fields:
            element[n] = stats_list[i].split(',')[fields.index(n)]
        server_name = element.pop('pxname')
        server_type = element.pop('svname')
        if stats_filter:
            filtered_element = element.copy()
            for el in element:
                if el not in stats_filter:
                    filtered_element.pop(el)
            element = filtered_element
        if server_name not in haproxy_stats:
            haproxy_stats[server_name] = {}
        if server_type == "FRONTEND" or server_type == "BACKEND":
            haproxy_stats[server_name][server_type] = element
        else:
            if 'UPSTREAM' not in haproxy_stats[server_name]:
                haproxy_stats[server_name]['UPSTREAM'] = {}
            haproxy_stats[server_name]['UPSTREAM'][server_type] = element

    return haproxy_stats


def haproxy_check(target='I@haproxy:proxy', target_type='compound', ignore_dead=False, ignore_services=[], ignore_upstreams=[], ignore_no_upstream=False, **kwargs):

    ''' Verify haproxy backends status '''

    agent = "haproxy status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.haproxy_status',
                                    arg=["stats_filter=['status']"],
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        haproxy_json = out[minion]['ret']
        for service in haproxy_json:
            if service not in ignore_services:
                if haproxy_json[service]['FRONTEND']['status'] != 'OPEN':
                    if minion not in failed_minions:
                        failed_minions.append(minion)
                if haproxy_json[service]['BACKEND']['status'] != 'UP':
                    if minion not in failed_minions:
                        failed_minions.append(minion)
                if 'UPSTREAM' in haproxy_json[service]:
                    for upstream in haproxy_json[service]['UPSTREAM']:
                        if upstream not in ignore_upstreams:
                            if haproxy_json[service]['UPSTREAM'][upstream]['status'] != 'UP':
                                if minion not in failed_minions:
                                    failed_minions.append(minion)
                else:
                    if not ignore_no_upstream:
                        if minion not in failed_minions:
                            failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True


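# disk.usage / disk.inodeusage report per-mountpoint dicts where 'capacity'
# and 'use' are percentage strings like '42%', hence the [:-1] slice before
# comparing against the integer limit.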
def df_check(target='*', target_type='glob', verify='space', space_limit=80, inode_limit=80, ignore_dead=False, ignore_partitions=[], **kwargs):

    ''' Verify storage space/inodes status '''

    supported_options = ['space', 'inodes']
    if verify not in supported_options:
        logger.error('Unsupported "verify" option.')
        logger.error('Supported options are: %s' % str(supported_options))
        __context__['retcode'] = 2
        return False

    if verify == 'space':
        fun_cmd = 'disk.usage'
        json_arg = 'capacity'
        limit = space_limit
    elif verify == 'inodes':
        fun_cmd = 'disk.inodeusage'
        json_arg = 'use'
        limit = inode_limit

    agent = "df status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun=fun_cmd,
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        df_json = out[minion]['ret']
        for disk in df_json:
            if disk not in ignore_partitions:
                if int(df_json[disk][json_arg][:-1]) > int(limit):
                    if minion not in failed_minions:
                        failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True


def load_check(target='*', target_type='glob', la1=3, la5=3, la15=3, ignore_dead=False, **kwargs):

    ''' Verify load average status '''

    agent = "load average status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='status.loadavg',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        la_json = out[minion]['ret']
        if float(la_json['1-min']) > float(la1):
            if minion not in failed_minions:
                failed_minions.append(minion)
        if float(la_json['5-min']) > float(la5):
            if minion not in failed_minions:
                failed_minions.append(minion)
        if float(la_json['15-min']) > float(la15):
            if minion not in failed_minions:
                failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True


def netdev_check(target='*', target_type='glob', rx_drop_limit=0, tx_drop_limit=0, ignore_devices=[], ignore_dead=False, **kwargs):

    ''' Verify netdev rx/tx drop status '''

    agent = "netdev rx/tx status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='status.netdev',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        dev_json = out[minion]['ret']
        for netdev in dev_json:
            if netdev not in ignore_devices:
                if int(dev_json[netdev]['rx_drop']) > int(rx_drop_limit):
                    if minion not in failed_minions:
                        failed_minions.append(minion)
                if int(dev_json[netdev]['tx_drop']) > int(tx_drop_limit):
                    if minion not in failed_minions:
                        failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True


def mem_check(target='*', target_type='glob', used_limit=80, ignore_dead=False, **kwargs):

    ''' Verify available memory status '''

    agent = "available memory status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='status.meminfo',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        mem_avail = int(out[minion]['ret']['MemAvailable']['value'])
        mem_total = int(out[minion]['ret']['MemTotal']['value'])
        used_pct = float((mem_total - mem_avail) * 100 / mem_total)
        if used_pct > float(used_limit):
            if minion not in failed_minions:
                failed_minions.append(minion)
        else:
            verified_minions.append( { minion : str(used_pct) + '%' } )

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True


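# Parse `ntpq -4 -p -n` output. The first character of the "remote" column is
# the tally code describing how ntpd classified the peer (see ntp_states
# below); it is split off into element['state'], and '*' marks the peer
# currently selected for synchronisation.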
def ntp_status(params=['-4', '-p', '-n']):

    ''' JSON formatted ntpq command output '''

    ntp_states = [
        { 'indicator': '#', 'comment': 'source selected, distance exceeds maximum value' },
        { 'indicator': 'o', 'comment': 'source selected, Pulse Per Second (PPS) used' },
        { 'indicator': '+', 'comment': 'source selected, included in final set' },
        { 'indicator': 'x', 'comment': 'source false ticker' },
        { 'indicator': '.', 'comment': 'source selected from end of candidate list' },
        { 'indicator': '-', 'comment': 'source discarded by cluster algorithm' },
        { 'indicator': '*', 'comment': 'current time source' },
        { 'indicator': ' ', 'comment': 'source discarded high stratum, failed sanity' }
    ]
    ntp_state_indicators = []
    for state in ntp_states:
        ntp_state_indicators.append(state['indicator'])
    source_types = {}
    source_types['l'] = "local (such as a GPS, WWVB)"
    source_types['u'] = "unicast (most common)"
    source_types['m'] = "multicast"
    source_types['b'] = "broadcast"
    source_types['-'] = "netaddr"

    proc = subprocess.Popen(['ntpq'] + params, stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    ntp_lines = stdout.split('\n')
    fields = re.sub(r"\s+", " ", ntp_lines[0]).split()
    fields[fields.index('st')] = 'stratum'
    fields[fields.index('t')] = 'source_type'

    ntp_peers = {}
    for line in ntp_lines[2:]:
        if len(line.strip()) > 0:
            element = {}
            values = re.sub(r"\s+", " ", line).split()
            for i in range(len(values)):
                if fields[i] == 'source_type':
                    element[fields[i]] = { 'indicator': values[i], 'comment': source_types[values[i]] }
                elif fields[i] in ['stratum', 'when', 'poll', 'reach']:
                    if values[i] == '-':
                        element[fields[i]] = int(-1)
                    else:
                        element[fields[i]] = int(values[i])
                elif fields[i] in ['delay', 'offset', 'jitter']:
                    element[fields[i]] = float(values[i])
                else:
                    element[fields[i]] = values[i]
            peer = element.pop('remote')
            peer_state = peer[0]
            if peer_state in ntp_state_indicators:
                peer = peer[1:]
            else:
                peer_state = 'f'
            element['current'] = False
            if peer_state == '*':
                element['current'] = True
            for state in ntp_states:
                if state['indicator'] == peer_state:
                    element['state'] = state.copy()
                if peer_state == 'f' and state['indicator'] == ' ':
                    fail_state = state.copy()
                    fail_state.pop('indicator')
                    fail_state['indicator'] = 'f'
                    element['state'] = fail_state
            ntp_peers[peer] = element

    return ntp_peers


def ntp_check(min_peers=1, max_stratum=3, target='*', target_type='glob', ignore_dead=False, **kwargs):

    ''' Verify NTP peers status '''

    agent = "ntpd peers status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.ntp_status',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        ntp_json = out[minion]['ret']
        good_peers = []
        for peer in ntp_json:
            if ntp_json[peer]['stratum'] < int(max_stratum) + 1:
                good_peers.append(peer)
        if len(good_peers) > int(min_peers) - 1:
            if minion not in verified_minions:
                verified_minions.append(minion)
        else:
            if minion not in failed_minions:
                failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True