[Tooling update] health_checks
* Added:
- RabbitMQ queue and vhost listing for a node
- Ceph health status check
- Ability to execute arbitrary ceph commands
- Retrieve Docker registry repositories recursively
- Retrieve GlusterFS pool list
- Retrieve GlusterFS volumes status
- Check GlusterFS pool health and size
- Check GlusterFS volumes bricks health and size
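
The new checks are ordinary execution module functions; a minimal usage
sketch from the Salt master (the 'cfg01*' minion id, timeout and kwargs
below are illustrative assumptions, not part of this change) could be:

    import salt.client

    client = salt.client.LocalClient()
    # target assumed: the master/monitoring minion, id 'cfg01*'
    print(client.cmd('cfg01*', 'health_checks.gluster_pool_check', timeout=30))
    print(client.cmd('cfg01*', 'health_checks.ceph_health_check',
                     kwarg={'expected_status': 'HEALTH_OK'}, timeout=30))
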
Related-Prod: PROD-29236
Change-Id: I1b12fe39d2d4f190db3cc68a6fe18f919f044eda
diff --git a/_modules/health_checks.py b/_modules/health_checks.py
index c321149..4245d03 100644
--- a/_modules/health_checks.py
+++ b/_modules/health_checks.py
@@ -1,3 +1,4 @@
+import requests
import subprocess
import socket
import salt.utils
@@ -282,6 +283,37 @@
return new_rabbitctl_json
+def rabbitmq_list_queues(vhost='/'):
+
+ ''' JSON formatted RabbitMQ queues list '''
+
+    proc = subprocess.Popen(['rabbitmqctl', 'list_queues', '-p', vhost], stdout=subprocess.PIPE)
+ stdout, stderr = proc.communicate()
+
+ queues = {}
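+    # "rabbitmqctl list_queues" prints one "<queue_name> <message_count>" row per queue;
+    # only lines ending in a digit are data rows, the rest is header text.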
+ for line in stdout.split('\n'):
+ if re.findall('[0-9]$', line):
+ queue_name, num = re.sub(r"\s+", " ", line).split()
+ queues[queue_name] = int(num)
+
+ return queues
+
+
+def rabbitmq_list_vhosts():
+
+ ''' JSON formatted RabbitMQ vhosts list '''
+
+ proc = subprocess.Popen(['rabbitmqctl', 'list_vhosts'], stdout=subprocess.PIPE)
+ stdout, stderr = proc.communicate()
+
+ vhosts = []
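+    # Keep only lines that look like vhost names (starting with "/"); the
+    # "Listing vhosts ..." header is skipped.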
+ for line in stdout.split('\n'):
+ if re.findall('^/', line):
+ vhosts.append(line)
+
+ return vhosts
+
+
def rabbitmq_cmd(cmd):
''' JSON formatted RabbitMQ command output '''
@@ -728,4 +760,272 @@
if kwargs.get("debug", False):
logger.info(verified_minions)
+
return True
+
+
+def gluster_pool_list():
+
+ ''' JSON formatted GlusterFS pool list command output '''
+
+ proc = subprocess.Popen(['gluster', 'pool', 'list'], stdout=subprocess.PIPE)
+ stdout, stderr = proc.communicate()
+
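+    # The first line of "gluster pool list" output is the column header (UUID, Hostname,
+    # State); every following row becomes a dict keyed by the peer UUID.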
+    regex = re.compile(r'^(\S+)\s+(\S+)\s+(\S+)$')
+ fields = regex.findall(stdout.split('\n')[0])[0]
+
+ pool = {}
+
+ for line in stdout.split('\n')[1:]:
+ if len(line.strip()) > 0:
+ peer = {}
+ values = regex.findall(line.strip())[0]
+ for i in range(len(fields)):
+ peer[fields[i].lower()] = values[i]
+ uuid = peer.pop('uuid')
+ pool[uuid] = peer
+
+ return pool
+
+
+def gluster_volume_status():
+
+ ''' JSON formatted GlusterFS volumes status command output '''
+
+ proc = subprocess.Popen(['gluster', 'volume', 'status', 'all', 'detail'], stdout=subprocess.PIPE)
+ stdout, stderr = proc.communicate()
+
+ begin_volume = False
+ brick_lookup = False
+ volumes = {}
+ volume_name = ""
+
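+    # Each volume section starts with "Status of volume: <name>". Brick sub-sections are
+    # separated by dashed lines, begin with a "Brick : <host>:<path>" line, and the
+    # "key : value" rows that follow are attached to the most recently seen brick.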
+ for line in stdout.split('\n'):
+ if 'Status of volume' in line:
+ volume_name = line.split(':')[1].strip()
+ volumes[volume_name] = { 'bricks': [] }
+ begin_volume = True
+ elif len(line.strip()) == 0:
+ if begin_volume:
+ begin_volume = False
+ elif '--------' in line:
+ brick_lookup = True
+ elif brick_lookup and line.split(':')[0].strip() == 'Brick':
+            brick_host, brick_path = re.findall(r'^Brick\ *:\ (.*)', line)[0].split()[1].split(':')
+ volumes[volume_name]['bricks'].append({ 'host': brick_host, 'path': brick_path })
+ brick_lookup = False
+ else:
+            brick_key, brick_value = line.split(':', 1)
+ brick_key = brick_key.strip().lower().replace(' ', '_')
+ brick_value = brick_value.strip()
+ volumes[volume_name]['bricks'][len(volumes[volume_name]['bricks']) - 1][brick_key] = brick_value
+
+ return volumes
+
+
+def gluster_pool_check(target='I@glusterfs:server', target_type='compound', expected_size=3, ignore_dead=False, **kwargs):
+
+ ''' Check GlusterFS peer status '''
+
+ agent = "glusterfs peer status"
+ out = __salt__['saltutil.cmd']( tgt=target,
+ tgt_type=target_type,
+ fun='health_checks.gluster_pool_list',
+ timeout=3,
+ kwargs='[batch=True]'
+ ) or None
+
+ if not _minions_output(out, agent, ignore_dead):
+ __context__['retcode'] = 2
+ return False
+
+ failed_minions = []
+ verified_minions = []
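+    # A minion fails the check if any peer is not "Connected" or if fewer than
+    # expected_size peers are connected.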
+ for minion in out:
+ verified_minions.append(minion)
+ gluster_json = out[minion]['ret']
+ alive_peers = []
+ for peer in gluster_json:
+ if gluster_json[peer]['state'] == 'Connected':
+ alive_peers.append(peer)
+ else:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+ if len(alive_peers) < expected_size:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ if not _failed_minions(out, agent, failed_minions):
+ __context__['retcode'] = 2
+ return False
+
+ if kwargs.get("debug", False):
+ logger.info(verified_minions)
+
+ return True
+
+
+def gluster_volumes_check(target='I@glusterfs:server', target_type='compound', expected_size=3, ignore_volumes=[], ignore_dead=False, **kwargs):
+
+ ''' Check GlusterFS volumes status '''
+
+ agent = "glusterfs volumes status"
+ out = __salt__['saltutil.cmd']( tgt=target,
+ tgt_type=target_type,
+ fun='health_checks.gluster_volume_status',
+ timeout=3,
+ kwargs='[batch=True]'
+ ) or None
+
+ if not _minions_output(out, agent, ignore_dead):
+ __context__['retcode'] = 2
+ return False
+
+ failed_minions = []
+ verified_minions = []
+ verified_volumes = []
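+    # A volume fails its minion's check if it reports no bricks, fewer bricks than
+    # expected_size, or any brick that is not online; volumes in ignore_volumes are skipped.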
+ for minion in out:
+ verified_minions.append(minion)
+ gluster_json = out[minion]['ret']
+ for volume in gluster_json:
+ if volume in ignore_volumes:
+ continue
+ else:
+ verified_volumes.append(volume)
+ alive_bricks = 0
+            if 'bricks' not in gluster_json[volume]:
+                if minion not in failed_minions:
+                    failed_minions.append(minion)
+                continue
+            bricks = gluster_json[volume]['bricks']
+ if len(bricks) < expected_size:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+ for brick in bricks:
+ if brick['online'] == 'Y':
+ alive_bricks += 1
+ else:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+ if alive_bricks < expected_size:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ if not _failed_minions(out, agent, failed_minions):
+ __context__['retcode'] = 2
+ return False
+
+ if kwargs.get("debug", False):
+ logger.info("Verified minions:")
+ logger.info(verified_minions)
+ logger.info("Verified volumes:")
+ logger.info(verified_volumes)
+
+ return True
+
+
+def ceph_cmd(cmd):
+
+ ''' JSON formatted ceph command output '''
+
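+    # Run an arbitrary "ceph <cmd>" with JSON output so the result can be returned as a dict.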
+ proc = subprocess.Popen(['ceph'] + cmd.split() + ['--format', 'json-pretty'], stdout=subprocess.PIPE)
+ stdout, stderr = proc.communicate()
+
+ return json.loads(stdout)
+
+
+def ceph_health_check(target='I@ceph:mon', target_type='compound', expected_status='HEALTH_OK', expected_state='active+clean', ignore_dead=False, **kwargs):
+
+ ''' Check all ceph monitors health status '''
+
+ agent = "ceph health status"
+ out = __salt__['saltutil.cmd']( tgt=target,
+ tgt_type=target_type,
+ fun='health_checks.ceph_cmd',
+ arg=['status'],
+ timeout=3
+ ) or None
+
+ if not _minions_output(out, agent, ignore_dead):
+ __context__['retcode'] = 2
+ return False
+
+ failed_minions = []
+ verified_minions = []
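+    # A minion fails the check when the overall status, full/nearfull flags, OSD counts,
+    # monitor quorum, per-monitor health or PG states deviate from the expected values.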
+ for minion in out:
+ verified_minions.append(minion)
+ ceph_json = out[minion]['ret']
+ fsid = ceph_json['fsid']
+
+ if ceph_json['health']['overall_status'] != expected_status:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ if ceph_json['osdmap']['osdmap']['full']:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ if ceph_json['osdmap']['osdmap']['nearfull']:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ num_osds = ceph_json['osdmap']['osdmap']['num_osds']
+ num_in_osds = ceph_json['osdmap']['osdmap']['num_in_osds']
+ num_up_osds = ceph_json['osdmap']['osdmap']['num_up_osds']
+ if not ( num_osds == num_in_osds == num_up_osds ):
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ quorum = len(ceph_json['quorum'])
+ quorum_names = len(ceph_json['quorum_names'])
+ mons = len(ceph_json['monmap']['mons'])
+ if not ( quorum == quorum_names == mons ):
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ for mon in ceph_json['health']['timechecks']['mons']:
+ if mon['health'] != expected_status:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ for srv in ceph_json['health']['health']['health_services']:
+ for mon in srv['mons']:
+ if mon['health'] != expected_status:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ for state in ceph_json['pgmap']['pgs_by_state']:
+ if state['state_name'] != expected_state:
+ if minion not in failed_minions:
+ failed_minions.append(minion)
+
+ if not _failed_minions(out, agent, failed_minions):
+ __context__['retcode'] = 2
+ return False
+
+ if kwargs.get("debug", False):
+ logger.info("Quorum:")
+ logger.info(ceph_json['quorum_names'])
+ logger.info("Verified minions:")
+ logger.info(verified_minions)
+
+ return True
+
+
+def docker_registry_list(host):
+
+ ''' Retrieve and list docker catalog '''
+
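+    # Query the registry v2 API: /v2/_catalog for the repository list, then
+    # /v2/<repo>/tags/list for each repository's tags.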
+ try:
+        if host.startswith('http'):
+            url = host + '/v2/'
+        else:
+            url = 'http://' + host + '/v2/'
+ repos = requests.get(url + '_catalog')
+
+ versions = {}
+ for repo in repos.json()['repositories']:
+ repo_versions = requests.get(url + repo + '/tags/list')
+ versions[repo] = repo_versions.json().pop('tags')
+ return versions
+    except Exception:
+        return {}