import requests
import subprocess
import socket
import salt.utils
import logging
import os
import re
import json

__author__ = "Dzmitry Stremkouski"
__copyright__ = "Copyright 2019, Mirantis Inc."
__license__ = "Apache 2.0"

logger = logging.getLogger(__name__)
stream = logging.StreamHandler()
logger.addHandler(stream)
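# Usage note (added, not part of the original code): these helpers are meant to be
# run on the Salt master node as an execution module, e.g. via salt-call. The module
# name "health_checks" used in the examples below is an assumption based on how the
# functions reference each other through __salt__['saltutil.cmd'](fun='health_checks.<...>').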


def _failed_minions(out, agent, failed_minions):

    ''' Verify failed minions '''

    if len(failed_minions) > 0:
        logger.error("%s check FAILED" % agent)
        logger.error("Some minions returned non-zero exit code or empty data")
        logger.error("Failed minions:" + str(failed_minions))
        for minion in failed_minions:
            logger.error(minion)
            logger.debug(str(out[minion]['ret']))
        __context__['retcode'] = 2
        return False

    return True


def _minions_output(out, agent, ignore_dead, ignore_empty=False):

    ''' Verify minions output and exit code '''

    if not out:
        logger.error("%s check FAILED" % agent)
        logger.error("No response from master cmd")
        __context__['retcode'] = 2
        return False

    if not ignore_dead:
        jid = next(iter(out.values()))['jid']
        job_stats = __salt__['saltutil.runner']('jobs.print_job', arg=[jid]) or None
        if not job_stats:
            logger.error("%s check FAILED" % agent)
            logger.error("No response from master runner")
            __context__['retcode'] = 2
            return False

        job_result = job_stats[jid]['Result']
        job_minions = job_stats[jid]['Minions']
        if len(job_minions) != len(job_result):
            logger.error("%s check FAILED" % agent)
            logger.error("Some minions are offline")
            logger.error(list(set(job_minions) - set(job_result.keys())))
            __context__['retcode'] = 2
            return False

    failed_minions = []
    for minion in out:
        if 'retcode' in out[minion]:
            if out[minion]['retcode'] == 0:
                if not ignore_empty:
                    if isinstance(out[minion]['ret'], bool):
                        if minion not in failed_minions:
                            failed_minions.append(minion)
                    elif len(out[minion]['ret']) == 0:
                        if minion not in failed_minions:
                            failed_minions.append(minion)
            else:
                if minion not in failed_minions:
                    failed_minions.append(minion)
        else:
            if minion not in failed_minions:
                failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    return True
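# Illustrative sketch (assumption, not captured from a real run) of the structure
# _minions_output() expects back from __salt__['saltutil.cmd']:
#   {'ctl01.local': {'jid': '20190405220000000000', 'retcode': 0, 'ret': True},
#    'cmp001.local': {'jid': '20190405220000000000', 'retcode': 1, 'ret': ''}}
# Minions with a non-zero retcode, or (when ignore_empty is False) a boolean or
# empty 'ret', end up in failed_minions.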


def minions_check(wait_timeout=1, gather_job_wait_timeout=1, target='*', target_type='glob', ignore_dead=False):

    ''' Verify minions are online '''

    agent = "Minions"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='test.ping',
                                    timeout=wait_timeout,
                                    gather_job_timeout=gather_job_wait_timeout
                                  ) or None

    return _minions_output(out, agent, ignore_dead, ignore_empty=True)
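# Example invocation (illustrative; module and function names assumed from this file):
#   salt-call health_checks.minions_check wait_timeout=5 gather_job_wait_timeout=10
# Returns True when every targeted minion answered test.ping, False otherwise.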


def time_diff_check(time_diff=1, target='*', target_type='glob', ignore_dead=False, **kwargs):

    ''' Verify time diff on servers '''

    agent = "Time diff"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='status.time',
                                    arg=['%s'],
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    minions_times = {}
    env_times = []
    verified_minions = []

    for minion in out:
        verified_minions.append(minion)
        if out[minion]['retcode'] == 0:
            minion_time = int(out[minion]['ret'])
            if str(minion_time) not in minions_times:
                minions_times[str(minion_time)] = []
            minions_times[str(minion_time)].append(minion)
            env_times.append(minion_time)

    env_times.sort()
    diff = env_times[-1] - env_times[0]

    if diff > time_diff:
        __context__['retcode'] = 2
        if kwargs.get("debug", False):
            return False, minions_times
        else:
            return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True
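# Example invocation (illustrative): allow at most 2 seconds of clock skew across
# minions and dump the per-timestamp grouping on failure:
#   salt-call health_checks.time_diff_check time_diff=2 debug=True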


def contrail_check(target='I@contrail:control or I@contrail:collector or I@opencontrail:compute or I@opencontrail:client', target_type='compound', ignore_dead=False, **kwargs):

    ''' Verify contrail status returns nothing critical '''

    agent = "Contrail status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='cmd.run',
                                    arg=['contrail-status'],
                                    timeout=5
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    pattern = '^(==|$|\S+\s+(active|backup|inactive\s\(disabled\son\sboot\)))'
    prog = re.compile(pattern)

    validated = []
    for minion in out:
        for line in out[minion]['ret'].split('\n'):
            if not prog.match(line) and minion not in failed_minions:
                failed_minions.append(minion)
        validated.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(validated)
    return True


def galera_check(cluster_size=3, target='I@galera:master or I@galera:slave', target_type='compound', ignore_dead=False, **kwargs):

    ''' Verify galera cluster size and state '''

    agent = "Galera status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='mysql.status',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []

    validated = []
    for minion in out:
        if int(out[minion]['ret']['wsrep_cluster_size']) != int(cluster_size) and minion not in failed_minions:
            failed_minions.append(minion)
        if out[minion]['ret']['wsrep_evs_state'] != 'OPERATIONAL' and minion not in failed_minions:
            failed_minions.append(minion)
        validated.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(validated)
        logger.info("Cluster size: " + str(out[validated[0]]['ret']['wsrep_cluster_size']))
        logger.info("Cluster state: " + str(out[validated[0]]['ret']['wsrep_evs_state']))
    return True
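# Example invocation (illustrative): verify a 3-node Galera cluster reachable via
# the I@galera:master or I@galera:slave compound target:
#   salt-call health_checks.galera_check cluster_size=3 debug=True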


def _quote_str(s, l=False, r=False):

    ''' Quote rabbitmq erl objects for json import '''

    if len(s) > 0:
        if l:
            s = s.lstrip()
        if r:
            s = s.rstrip()
        if (s[0] == "'") and (s[-1] != "'") and r and not l:
            s += "'"
        if (s[0] == '"') and (s[-1] != '"') and r and not l:
            s += '"'
        if (s[-1] == "'") and (s[0] != "'") and l and not r:
            s = "'" + s
        if (s[-1] == '"') and (s[0] != '"') and l and not r:
            s = '"' + s
        if (s[-1] != "'") and (s[-1] != '"') and (s[0] != "'") and (s[0] != '"'):
            s = '"' + s.replace('"', '\\\"') + '"'
        else:
            if (not l) and (not r) and s[0] != '"' and not s[-1] != '"':
                s = s.replace('"', '\\\"')
        return s.replace("'", '"')
    else:
        return s


def _sanitize_rmqctl_output(string):

    ''' Sanitize rabbitmq erl objects for json import '''

    rabbitctl_json = ""
    for line in string.split(','):
        copy = line
        left = ""
        right = ""
        mid = copy
        lpar = False
        rpar = False
        if re.search('([\[\{\s]+)(.*)', copy):
            mid = re.sub('^([\[\{\s]+)', '', copy)
            left = copy[:-len(mid)]
            copy = mid
            lpar = True
        if re.search('(.*)([\]\}\s]+)$', copy):
            mid = re.sub('([\]\}\s]+)$', '', copy)
            right = copy[len(mid):]
            copy = mid
            rpar = True
        result = left + _quote_str(mid, l=lpar, r=rpar) + right
        if (not rpar) and lpar and (len(left.strip()) > 0) and (left.strip()[-1] == '{'):
            result += ":"
        else:
            result += ","
        rabbitctl_json += result

    rabbitctl_json = rabbitctl_json[:-1]
    new_rabbitctl_json = rabbitctl_json
    for s in re.findall('"[^:\[{\]}]+"\s*:\s*("[^\[{\]}]+")', rabbitctl_json):
        if '"' in s[1:][:-1]:
            orig = s
            changed = '"' + s.replace('\\', '\\\\').replace('"', '\\\"') + '"'
            new_rabbitctl_json = new_rabbitctl_json.replace(orig, changed)
    return new_rabbitctl_json
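# Illustrative example (assumed, not output captured from a real cluster) of what
# _sanitize_rmqctl_output() is meant to do: turn an Erlang-style proplist such as
#   [{nodes,[{disc,['rabbit@ctl01']}]},{running_nodes,['rabbit@ctl01']}]
# into a JSON-parsable string along the lines of
#   [{"nodes":[{"disc":["rabbit@ctl01"]}]},{"running_nodes":["rabbit@ctl01"]}]
# which rabbitmq_cmd() then feeds to json.loads().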


def rabbitmq_list_queues(vhost='/'):

    ''' JSON formatted RabbitMQ queues list '''

    proc = subprocess.Popen(['rabbitmqctl', 'list_queues', '-p', vhost], stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    queues = {}
    for line in stdout.split('\n'):
        if re.findall('[0-9]$', line):
            queue_name, num = re.sub(r"\s+", " ", line).split()
            queues[queue_name] = int(num)

    return queues


def rabbitmq_list_vhosts():

    ''' JSON formatted RabbitMQ vhosts list '''

    proc = subprocess.Popen(['rabbitmqctl', 'list_vhosts'], stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    vhosts = []
    for line in stdout.split('\n'):
        if re.findall('^/', line):
            vhosts.append(line)

    return vhosts


def rabbitmq_cmd(cmd):

    ''' JSON formatted RabbitMQ command output '''

    supported_commands = ['status', 'cluster_status', 'list_hashes', 'list_ciphers']
    if cmd not in supported_commands:
        logger.error("Command is not supported yet, sorry")
        logger.error("Supported commands are: " + str(supported_commands))
        __context__['retcode'] = 2
        return False

    proc = subprocess.Popen(['rabbitmqctl', cmd], stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    rabbitmqctl_cutoff = stdout[int(stdout.find('[')):int(stdout.rfind(']'))+1].replace('\n', '')
    return json.loads(_sanitize_rmqctl_output(rabbitmqctl_cutoff))
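# Example invocation (illustrative): only the rabbitmqctl sub-commands listed in
# supported_commands above are accepted, e.g.
#   salt-call health_checks.rabbitmq_cmd cluster_status
# which returns the parsed Erlang proplist as a Python/JSON structure.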


def rabbitmq_check(target='I@rabbitmq:server', target_type='compound', ignore_dead=False, **kwargs):

    ''' Verify rabbit cluster and its alarms '''

    agent = "RabbitMQ status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.rabbitmq_cmd',
                                    arg=['cluster_status'],
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []

    for minion in out:
        rabbitmqctl_json = out[minion]['ret']
        running_nodes = []
        available_nodes = []
        alarms = []
        for el in rabbitmqctl_json:
            if 'alarms' in el:
                alarms = el['alarms']
            if 'nodes' in el:
                available_nodes = el['nodes'][0]['disc']
            if 'running_nodes' in el:
                running_nodes = el['running_nodes']

        if sorted(running_nodes) == sorted(available_nodes):
            nodes_alarms = []
            for node in running_nodes:
                for el in alarms:
                    if node in el:
                        if len(el[node]) > 0:
                            nodes_alarms.append(el[node])
            if len(nodes_alarms) > 0:
                failed_minions.append(minion)
        else:
            failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(running_nodes)
    return True


def haproxy_status(socket_path='/run/haproxy/admin.sock', buff_size=8192, encoding='UTF-8', stats_filter=[]):

    ''' JSON formatted haproxy status '''

    stat_cmd = 'show stat\n'

    if not os.path.exists(socket_path):
        logger.error('Socket %s does not exist or haproxy not running' % socket_path)
        __context__['retcode'] = 2
        return False

    client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    client.connect(socket_path)

    client.send(bytearray(stat_cmd, encoding))
    output = client.recv(buff_size)

    res = ""
    while output:
        res += output.decode(encoding)
        output = client.recv(buff_size)
    client.close()

    haproxy_stats = {}
    res_list = res.split('\n')
    fields = res_list[0][2:].split(',')
    stats_list = []
    for line in res_list[1:]:
        if len(line.strip()) > 0:
            stats_list.append(line)

    for i in range(len(stats_list)):
        element = {}
        for n in fields:
            element[n] = stats_list[i].split(',')[fields.index(n)]
        server_name = element.pop('pxname')
        server_type = element.pop('svname')
        if stats_filter:
            filtered_element = element.copy()
            for el in element:
                if el not in stats_filter:
                    filtered_element.pop(el)
            element = filtered_element
        if server_name not in haproxy_stats:
            haproxy_stats[server_name] = {}
        if server_type == "FRONTEND" or server_type == "BACKEND":
            haproxy_stats[server_name][server_type] = element
        else:
            if 'UPSTREAM' not in haproxy_stats[server_name]:
                haproxy_stats[server_name]['UPSTREAM'] = {}
            haproxy_stats[server_name]['UPSTREAM'][server_type] = element

    return haproxy_stats
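# Illustrative sketch (assumed service and backend names; field names follow
# HAProxy's CSV stats format) of what haproxy_status(stats_filter=['status'])
# returns:
#   {'mysql_cluster': {'FRONTEND': {'status': 'OPEN'},
#                      'BACKEND': {'status': 'UP'},
#                      'UPSTREAM': {'ctl01': {'status': 'UP'}}}}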


def haproxy_check(target='I@haproxy:proxy', target_type='compound', ignore_dead=False, ignore_services=[], ignore_upstreams=[], ignore_no_upstream=False, **kwargs):

    ''' Verify haproxy backends status '''

    agent = "haproxy status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.haproxy_status',
                                    arg=["stats_filter=['status']"],
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        haproxy_json = out[minion]['ret']
        for service in haproxy_json:
            if service not in ignore_services:
                if haproxy_json[service]['FRONTEND']['status'] != 'OPEN':
                    if minion not in failed_minions:
                        failed_minions.append(minion)
                if haproxy_json[service]['BACKEND']['status'] != 'UP':
                    if minion not in failed_minions:
                        failed_minions.append(minion)
                if 'UPSTREAM' in haproxy_json[service]:
                    for upstream in haproxy_json[service]['UPSTREAM']:
                        if upstream not in ignore_upstreams:
                            if haproxy_json[service]['UPSTREAM'][upstream]['status'] != 'UP':
                                if minion not in failed_minions:
                                    failed_minions.append(minion)
                else:
                    if not ignore_no_upstream:
                        if minion not in failed_minions:
                            failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True
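# Example invocation (illustrative; the 'stats' service name and CLI list syntax are
# assumptions): skip a stats-only frontend and tolerate services without upstreams:
#   salt-call health_checks.haproxy_check ignore_services='["stats"]' ignore_no_upstream=True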


def df_check(target='*', target_type='glob', verify='space', space_limit=80, inode_limit=80, ignore_dead=False, ignore_partitions=[], **kwargs):

    ''' Verify storage space/inodes status '''

    supported_options = ['space', 'inodes']
    if verify not in supported_options:
        logger.error('Unsupported "verify" option.')
        logger.error('Supported options are: %s' % str(supported_options))
        __context__['retcode'] = 2
        return False

    if verify == 'space':
        fun_cmd = 'disk.usage'
        json_arg = 'capacity'
        limit = space_limit
    elif verify == 'inodes':
        fun_cmd = 'disk.inodeusage'
        json_arg = 'use'
        limit = inode_limit

    agent = "df status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun=fun_cmd,
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        df_json = out[minion]['ret']
        for disk in df_json:
            if disk not in ignore_partitions:
                if int(df_json[disk][json_arg][:-1]) > int(limit):
                    if minion not in failed_minions:
                        failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True
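# Example invocations (illustrative): check used disk space against the default 80%
# threshold, or inode usage with a custom limit:
#   salt-call health_checks.df_check
#   salt-call health_checks.df_check verify=inodes inode_limit=70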


def load_check(target='*', target_type='glob', la1=3, la5=3, la15=3, ignore_dead=False, **kwargs):

    ''' Verify load average status '''

    agent = "load average status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='status.loadavg',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        la_json = out[minion]['ret']
        if float(la_json['1-min']) > float(la1):
            if minion not in failed_minions:
                failed_minions.append(minion)
        if float(la_json['5-min']) > float(la5):
            if minion not in failed_minions:
                failed_minions.append(minion)
        if float(la_json['15-min']) > float(la15):
            if minion not in failed_minions:
                failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True


def netdev_check(target='*', target_type='glob', rx_drop_limit=0, tx_drop_limit=0, ignore_devices=[], ignore_dead=False, **kwargs):

    ''' Verify netdev rx/tx drop status '''

    agent = "netdev rx/tx status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='status.netdev',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        dev_json = out[minion]['ret']
        for netdev in dev_json:
            if netdev not in ignore_devices:
                if int(dev_json[netdev]['rx_drop']) > int(rx_drop_limit):
                    if minion not in failed_minions:
                        failed_minions.append(minion)
                if int(dev_json[netdev]['tx_drop']) > int(tx_drop_limit):
                    if minion not in failed_minions:
                        failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True


def mem_check(target='*', target_type='glob', used_limit=80, ignore_dead=False, **kwargs):

    ''' Verify available memory status '''

    agent = "available memory status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='status.meminfo',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        mem_avail = int(out[minion]['ret']['MemAvailable']['value'])
        mem_total = int(out[minion]['ret']['MemTotal']['value'])
        used_pct = (mem_total - mem_avail) * 100.0 / mem_total
        if used_pct > float(used_limit):
            if minion not in failed_minions:
                failed_minions.append(minion)
        else:
            verified_minions.append( { minion : str(used_pct) + '%' } )

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)
    return True
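# Example invocation (illustrative): alert when more than 90% of MemTotal is in use,
# based on MemAvailable as reported by status.meminfo:
#   salt-call health_checks.mem_check used_limit=90 debug=True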


def ntp_status(params=['-4', '-p', '-n']):

    ''' JSON formatted ntpq command output '''

    ntp_states = [
        { 'indicator': '#', 'comment': 'source selected, distance exceeds maximum value' },
        { 'indicator': 'o', 'comment': 'source selected, Pulse Per Second (PPS) used' },
        { 'indicator': '+', 'comment': 'source selected, included in final set' },
        { 'indicator': 'x', 'comment': 'source false ticker' },
        { 'indicator': '.', 'comment': 'source selected from end of candidate list' },
        { 'indicator': '-', 'comment': 'source discarded by cluster algorithm' },
        { 'indicator': '*', 'comment': 'current time source' },
        { 'indicator': ' ', 'comment': 'source discarded high stratum, failed sanity' }
    ]
    ntp_state_indicators = []
    for state in ntp_states:
        ntp_state_indicators.append(state['indicator'])
    source_types = {}
    source_types['l'] = "local (such as a GPS, WWVB)"
    source_types['u'] = "unicast (most common)"
    source_types['m'] = "multicast"
    source_types['b'] = "broadcast"
    source_types['-'] = "netaddr"

    proc = subprocess.Popen(['ntpq'] + params, stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    ntp_lines = stdout.split('\n')
    fields = re.sub("\s+", " ", ntp_lines[0]).split()
    fields[fields.index('st')] = 'stratum'
    fields[fields.index('t')] = 'source_type'

    ntp_peers = {}
    for line in ntp_lines[2:]:
        if len(line.strip()) > 0:
            element = {}
            values = re.sub("\s+", " ", line).split()
            for i in range(len(values)):
                if fields[i] == 'source_type':
                    element[fields[i]] = { 'indicator': values[i], 'comment': source_types[values[i]] }
                elif fields[i] in ['stratum', 'when', 'poll', 'reach']:
                    if values[i] == '-':
                        element[fields[i]] = int(-1)
                    else:
                        element[fields[i]] = int(values[i])
                elif fields[i] in ['delay', 'offset', 'jitter']:
                    element[fields[i]] = float(values[i])
                else:
                    element[fields[i]] = values[i]
            peer = element.pop('remote')
            peer_state = peer[0]
            if peer_state in ntp_state_indicators:
                peer = peer[1:]
            else:
                peer_state = 'f'
            element['current'] = False
            if peer_state == '*':
                element['current'] = True
            for state in ntp_states:
                if state['indicator'] == peer_state:
                    element['state'] = state.copy()
                if peer_state == 'f' and state['indicator'] == ' ':
                    fail_state = state.copy()
                    fail_state.pop('indicator')
                    fail_state['indicator'] = 'f'
                    element['state'] = fail_state
            ntp_peers[peer] = element

    return ntp_peers
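# Illustrative sketch (assumed peer address and values) of a single entry in the
# dict returned by ntp_status(), keyed by the peer with its selection indicator
# stripped from the front:
#   {'10.10.0.1': {'stratum': 2, 'delay': 0.3, 'offset': 0.01, 'jitter': 0.05,
#                  'current': True,
#                  'source_type': {'indicator': 'u', 'comment': 'unicast (most common)'},
#                  'state': {'indicator': '*', 'comment': 'current time source'}}}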


def ntp_check(min_peers=1, max_stratum=3, target='*', target_type='glob', ignore_dead=False, **kwargs):

    ''' Verify NTP peers status '''

    agent = "ntpd peers status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.ntp_status',
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        ntp_json = out[minion]['ret']
        good_peers = []
        for peer in ntp_json:
            if ntp_json[peer]['stratum'] < int(max_stratum) + 1:
                good_peers.append(peer)
        if len(good_peers) > int(min_peers) - 1:
            if minion not in verified_minions:
                verified_minions.append(minion)
        else:
            if minion not in failed_minions:
                failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)

    return True


def gluster_pool_list():

    ''' JSON formatted GlusterFS pool list command output '''

    proc = subprocess.Popen(['gluster', 'pool', 'list'], stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    regex = re.compile('^(\S+)\s+(\S+)\s+(\S+)$')
    fields = regex.findall(stdout.split('\n')[0])[0]

    pool = {}

    for line in stdout.split('\n')[1:]:
        if len(line.strip()) > 0:
            peer = {}
            values = regex.findall(line.strip())[0]
            for i in range(len(fields)):
                peer[fields[i].lower()] = values[i]
            uuid = peer.pop('uuid')
            pool[uuid] = peer

    return pool


def gluster_volume_status():

    ''' JSON formatted GlusterFS volumes status command output '''

    proc = subprocess.Popen(['gluster', 'volume', 'status', 'all', 'detail'], stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    begin_volume = False
    brick_lookup = False
    volumes = {}
    volume_name = ""

    for line in stdout.split('\n'):
        if 'Status of volume' in line:
            volume_name = line.split(':')[1].strip()
            volumes[volume_name] = { 'bricks': [] }
            begin_volume = True
        elif len(line.strip()) == 0:
            if begin_volume:
                begin_volume = False
        elif '--------' in line:
            brick_lookup = True
        elif brick_lookup and line.split(':')[0].strip() == 'Brick':
            brick_host, brick_path = re.findall('^Brick\ *:\ (.*)', line)[0].split()[1].split(':')
            volumes[volume_name]['bricks'].append({ 'host': brick_host, 'path': brick_path })
            brick_lookup = False
        else:
            brick_key, brick_value = line.split(':')
            brick_key = brick_key.strip().lower().replace(' ', '_')
            brick_value = brick_value.strip()
            volumes[volume_name]['bricks'][len(volumes[volume_name]['bricks']) - 1][brick_key] = brick_value

    return volumes
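# Illustrative sketch (assumed volume, host and brick field names, derived from
# 'gluster volume status all detail' output) of what gluster_volume_status() returns:
#   {'glusterfs-volume': {'bricks': [{'host': 'ctl01', 'path': '/srv/glusterfs/brick',
#                                     'online': 'Y'}]}}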


def gluster_pool_check(target='I@glusterfs:server', target_type='compound', expected_size=3, ignore_dead=False, **kwargs):

    ''' Check GlusterFS peer status '''

    agent = "glusterfs peer status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.gluster_pool_list',
                                    timeout=3,
                                    kwargs='[batch=True]'
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        gluster_json = out[minion]['ret']
        alive_peers = []
        for peer in gluster_json:
            if gluster_json[peer]['state'] == 'Connected':
                alive_peers.append(peer)
            else:
                if minion not in failed_minions:
                    failed_minions.append(minion)
        if len(alive_peers) < expected_size:
            if minion not in failed_minions:
                failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info(verified_minions)

    return True


def gluster_volumes_check(target='I@glusterfs:server', target_type='compound', expected_size=3, ignore_volumes=[], ignore_dead=False, **kwargs):

    ''' Check GlusterFS volumes status '''

    agent = "glusterfs volumes status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.gluster_volume_status',
                                    timeout=3,
                                    kwargs='[batch=True]'
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    verified_volumes = []
    for minion in out:
        verified_minions.append(minion)
        gluster_json = out[minion]['ret']
        for volume in gluster_json:
            if volume in ignore_volumes:
                continue
            else:
                verified_volumes.append(volume)
            alive_bricks = 0
            if 'bricks' not in gluster_json[volume]:
                if minion not in failed_minions:
                    failed_minions.append(minion)
                continue
            bricks = gluster_json[volume]['bricks']
            if len(bricks) < expected_size:
                if minion not in failed_minions:
                    failed_minions.append(minion)
            for brick in bricks:
                if brick['online'] == 'Y':
                    alive_bricks += 1
                else:
                    if minion not in failed_minions:
                        failed_minions.append(minion)
            if alive_bricks < expected_size:
                if minion not in failed_minions:
                    failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info("Verified minions:")
        logger.info(verified_minions)
        logger.info("Verified volumes:")
        logger.info(verified_volumes)

    return True


def ceph_cmd(cmd):

    ''' JSON formatted ceph command output '''

    proc = subprocess.Popen(['ceph'] + cmd.split() + ['--format', 'json-pretty'], stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()

    return json.loads(stdout)
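# Example invocation (illustrative): any ceph sub-command that supports
# '--format json-pretty' can be proxied through ceph_cmd, e.g.
#   salt-call health_checks.ceph_cmd status
#   salt-call health_checks.ceph_cmd 'osd df'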


def ceph_health_check(target='I@ceph:mon', target_type='compound', expected_status='HEALTH_OK', expected_state='active+clean', ignore_dead=False, **kwargs):

    ''' Check all ceph monitors health status '''

    agent = "ceph health status"
    out = __salt__['saltutil.cmd']( tgt=target,
                                    tgt_type=target_type,
                                    fun='health_checks.ceph_cmd',
                                    arg=['status'],
                                    timeout=3
                                  ) or None

    if not _minions_output(out, agent, ignore_dead):
        __context__['retcode'] = 2
        return False

    failed_minions = []
    verified_minions = []
    for minion in out:
        verified_minions.append(minion)
        ceph_json = out[minion]['ret']
        fsid = ceph_json['fsid']

        if ceph_json['health']['overall_status'] != expected_status:
            if minion not in failed_minions:
                failed_minions.append(minion)

        if ceph_json['osdmap']['osdmap']['full']:
            if minion not in failed_minions:
                failed_minions.append(minion)

        if ceph_json['osdmap']['osdmap']['nearfull']:
            if minion not in failed_minions:
                failed_minions.append(minion)

        num_osds = ceph_json['osdmap']['osdmap']['num_osds']
        num_in_osds = ceph_json['osdmap']['osdmap']['num_in_osds']
        num_up_osds = ceph_json['osdmap']['osdmap']['num_up_osds']
        if not ( num_osds == num_in_osds == num_up_osds ):
            if minion not in failed_minions:
                failed_minions.append(minion)

        quorum = len(ceph_json['quorum'])
        quorum_names = len(ceph_json['quorum_names'])
        mons = len(ceph_json['monmap']['mons'])
        if not ( quorum == quorum_names == mons ):
            if minion not in failed_minions:
                failed_minions.append(minion)

        for mon in ceph_json['health']['timechecks']['mons']:
            if mon['health'] != expected_status:
                if minion not in failed_minions:
                    failed_minions.append(minion)

        for srv in ceph_json['health']['health']['health_services']:
            for mon in srv['mons']:
                if mon['health'] != expected_status:
                    if minion not in failed_minions:
                        failed_minions.append(minion)

        for state in ceph_json['pgmap']['pgs_by_state']:
            if state['state_name'] != expected_state:
                if minion not in failed_minions:
                    failed_minions.append(minion)

    if not _failed_minions(out, agent, failed_minions):
        __context__['retcode'] = 2
        return False

    if kwargs.get("debug", False):
        logger.info("Quorum:")
        logger.info(ceph_json['quorum_names'])
        logger.info("Verified minions:")
        logger.info(verified_minions)

    return True


def docker_registry_list(host):

    ''' Retrieve and list docker catalog '''

    try:
        if host[0:4] == 'http':
            url = host + '/v2/'
        else:
            url = 'http://' + host + '/v2/'
        repos = requests.get(url + '_catalog')

        versions = {}
        for repo in repos.json()['repositories']:
            repo_versions = requests.get(url + repo + '/tags/list')
            versions[repo] = repo_versions.json().pop('tags')
        return versions
    except Exception:
        return {}
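# Example invocation (illustrative; the registry address is an assumption): list
# repositories and their tags from a Docker registry exposing the v2 API; any error
# yields an empty dict:
#   salt-call health_checks.docker_registry_list 10.10.0.1:5000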