Additions and fixes to network check
- Per interface tree maps
- proper virtual nodes detection
- KVM nodes listing
- CPU count fix
- Basic service fail check (wip)
Change-Id: I62b68793404eeff957ef70468c954df2fda869a5
Related-PROD: PROD-38972
diff --git a/cfg_checker/common/file_utils.py b/cfg_checker/common/file_utils.py
index 9c043a8..c550184 100644
--- a/cfg_checker/common/file_utils.py
+++ b/cfg_checker/common/file_utils.py
@@ -48,9 +48,9 @@
def read_file_as_lines(filename):
_list = []
- with open(filename, 'r') as fr:
+ with open(filename, 'rt') as fr:
for line in fr:
- _list.append(line)
+ _list.append(line.rstrip())
return _list
diff --git a/cfg_checker/modules/network/mapper.py b/cfg_checker/modules/network/mapper.py
index ba9a256..cea81bf 100644
--- a/cfg_checker/modules/network/mapper.py
+++ b/cfg_checker/modules/network/mapper.py
@@ -243,6 +243,123 @@
# data is already there, just add VIP
net_data['ifs'].append(_if)
+ def process_interface(lvl, interface, tree, res):
+ # get children for each root
+ # tree row item (<if_name>, [<parents>], [<childs>])
+ if lvl not in tree:
+ # - no level - add it
+ tree[lvl] = {}
+ # there is such interface in this level?
+ if interface not in tree[lvl]:
+ # - IF not present
+ # -- get parents, add
+ _p = res[interface]['lower']
+ # if None, put empty list
+ _p = _p if _p else []
+ # -- get children, add
+ _c = res[interface]['upper']
+ # if None, put empty list
+ _c = _c if _c else []
+ tree[lvl].update({
+ interface: {
+ "parents": _p,
+ "children": _c,
+ "size": len(_p) if len(_p) > len(_c) else len(_c)
+ }
+ })
+ for p_if in tree[lvl][interface]["parents"]:
+ # -- cycle: execute process for next parent, lvl-1
+ process_interface(lvl-1, p_if, tree, res)
+ for c_if in tree[lvl][interface]["children"]:
+ # -- cycle: execute process for next child, lvl+1
+ process_interface(lvl+1, c_if, tree, res)
+ else:
+ # - IF present - exit (been here already)
+ return
+
+ def _put(cNet, cIndex, _list):
+ for _cI in range(cIndex, len(_list)):
+ # add child per index
+ # if space is free
+ if not _list[_cI]:
+ _list[_cI] = cNet
+ break
+
+ # build network hierarchy
+ nr = node_data['networks']
+ # walk interface tree
+ for _ifname in node_data['networks']:
+ _tree = {}
+ _level = 0
+ process_interface(_level, _ifname, _tree, nr)
+ # save tree for node/if
+ node_data['networks'][_ifname]['tree'] = _tree
+
+ # debug, print built tree
+ # logger_cli.debug("# '{}'".format(_ifname))
+ lvls = _tree.keys()
+ lvls.sort()
+ n = len(lvls)
+ m = max([len(_tree[k].keys()) for k in _tree.keys()])
+ matrix = [["" for i in range(m)] for j in range(n)]
+ x = 0
+ while True:
+ _lv = lvls.pop(0)
+ # get all interfaces on this level
+ nets = _tree[_lv].keys()
+ while True:
+ y = 0
+ # get next interface
+ _net = nets.pop(0)
+ # all nets
+ _a = [_net]
+ # put current interface if this is only one left
+ if not _tree[_lv][_net]['children']:
+ if _net not in matrix[x]:
+ _put(_net, y, matrix[x])
+ y += 1
+ else:
+ # get all nets with same child
+ for _c in _tree[_lv][_net]['children']:
+ for _o_net in nets:
+ if _c in _tree[_lv][_o_net]['children']:
+ _a.append(_o_net)
+ # flush collected nets
+ for idx in range(len(_a)):
+ if _a[idx] in matrix[x]:
+ # there is such interface on this level
+ # get index
+ _nI = matrix[x].index(_a[idx])
+ _put(_c, _nI, matrix[x+1])
+ else:
+ # there is no such interface
+ # add it
+ for _nI in range(len(matrix[x])):
+ if not matrix[x][_nI]:
+ matrix[x][_nI] = _a[idx]
+ # also, put child
+ _put(_c, _nI, matrix[x+1])
+ break
+ # remove collected nets from processing
+ if _a[idx] in nets:
+ nets.remove(_a[idx])
+ y += len(_a)
+ if not nets:
+ x += 1
+ break
+ if not lvls:
+ break
+
+ lines = []
+ _columns = [len(max([i for i in li])) for li in matrix]
+ for idx_y in range(m):
+ line = ""
+ for idx_x in range(n):
+ _fmt = "{" + ":{}".format(_columns[idx_x]) + "} "
+ line += _fmt.format(matrix[idx_x][idx_y])
+ lines.append(line)
+ node_data['networks'][_ifname]['matrix'] = matrix
+ node_data['networks'][_ifname]['lines'] = lines
return _runtime
def map_network(self, source):
@@ -282,7 +399,11 @@
self.errors.NET_NO_RUNTIME_NETWORK,
reclass_net=str(network)
)
- logger_cli.info(" {:-^50}".format(" No runtime network "))
+ logger_cli.warn(
+ "WARN: {}: {}".format(
+ " No runtime network ", str(network)
+ )
+ )
continue
# hostnames
names = sorted(_runtime[network].keys())
@@ -464,6 +585,8 @@
"interface": _if_name,
"interface_error": _if_rc,
"interface_note": _if_name_suffix,
+ "interface_map": "\n".join(_host['lines']),
+ "interface_matrix": _host['matrix'],
"ip_address": _ip_str,
"address_type": _proto,
"rt_mtu": _host['mtu'],
diff --git a/cfg_checker/modules/reclass/comparer.py b/cfg_checker/modules/reclass/comparer.py
index 6591d16..8ef8894 100644
--- a/cfg_checker/modules/reclass/comparer.py
+++ b/cfg_checker/modules/reclass/comparer.py
@@ -62,7 +62,7 @@
_size = f.tell()
# TODO: do smth with the data
if not _yaml:
- logger_cli.warning("WARN: empty file '{}'".format(fname))
+ # logger.warning("WARN: empty file '{}'".format(fname))
_yaml = {}
else:
logger.debug("...loaded YAML '{}' ({}b)".format(fname, _size))
@@ -150,6 +150,11 @@
# ignore _source key
if k == "_source":
continue
+ # ignore secrets
+ if isinstance(k, str) and k == "secrets.yml":
+ continue
+ if isinstance(k, str) and k.find("_password") > 0:
+ continue
# check if this is an env name cluster entry
if dict2 is not None and \
k == self.model_name_1 and \
diff --git a/cfg_checker/nodes.py b/cfg_checker/nodes.py
index ca4e261..0ca1e85 100644
--- a/cfg_checker/nodes.py
+++ b/cfg_checker/nodes.py
@@ -147,7 +147,7 @@
}
return _info
- def get_cmd_for_nodes(self, cmd, target_key, target_dict=None):
+ def get_cmd_for_nodes(self, cmd, target_key, target_dict=None, nodes=None):
"""Function runs. cmd.run and parses result into place
or into dict structure provided
@@ -160,8 +160,9 @@
_nodes = target_dict
else:
_nodes = self.nodes
- _result = self.execute_cmd_on_active_nodes(cmd)
+ _result = self.execute_cmd_on_active_nodes(cmd, nodes=nodes)
for node, data in _nodes.iteritems():
+
if node in self.skip_list:
logger_cli.debug(
"... '{}' skipped while collecting '{}'".format(
@@ -176,6 +177,8 @@
# Save data
if data['status'] == const.NODE_DOWN:
data[target_key] = None
+ elif node not in _result:
+ continue
elif not _result[node]:
logger_cli.debug(
"... '{}' not responded after '{}'".format(
@@ -369,11 +372,11 @@
self.not_responded = [_n for _n in _r.keys() if not _r[_n]]
return _r
- def execute_cmd_on_active_nodes(self, cmd):
+ def execute_cmd_on_active_nodes(self, cmd, nodes=None):
# execute cmd
self.not_responded = []
_r = self.salt.cmd(
- self.active_nodes_compound,
+ nodes if nodes else self.active_nodes_compound,
'cmd.run',
param=cmd,
expr_form="compound"
diff --git a/cfg_checker/reports/reporter.py b/cfg_checker/reports/reporter.py
index 8059fab..7aa376a 100644
--- a/cfg_checker/reports/reporter.py
+++ b/cfg_checker/reports/reporter.py
@@ -1,9 +1,11 @@
import abc
import os
+import re
import time
from cfg_checker.common import const
from cfg_checker.common import logger_cli
+from cfg_checker.common.file_utils import read_file_as_lines
from cfg_checker.nodes import salt_master
import jinja2
@@ -19,6 +21,10 @@
_disk_critical = 90
_ram_warn = 5
_ram_critical = 3
+_softnet_interval = 5
+
+UP = const.NODE_UP
+DOWN = const.NODE_DOWN
def line_breaks(text):
@@ -217,16 +223,61 @@
else:
return int(value)
- def _lscpu(field, key, _dict):
+ def _lscpu(_dict):
+ _key = "lscpu"
+ _key_r = "lscpu_raw"
+ # get all of the values
_f_cmd = salt_master.get_cmd_for_nodes
- _cmd = "lscpu | grep -e \"^{}:\" | cut -d\":\" -f2 " \
- "| sed -e 's/^[[:space:]]*//'"
- _f_cmd(_cmd.format(field), key, target_dict=_dict)
+ _cmd = "lscpu | sed -n '/\\:/s/ \\+/ /gp'"
+ _f_cmd(_cmd, _key_r, target_dict=_dict)
+ # parse them and put into dict
+ for node, dt in _dict.iteritems():
+ dt[_key] = {}
+ if dt['status'] == DOWN:
+ continue
+ lines = dt[_key_r].splitlines()
+ for line in lines:
+ li = line.split(':')
+ _var_name = li[0].lower()
+ _var_name = re.sub(' ', '_', _var_name)
+ _var_name = re.sub('|'.join(['\\(', '\\)']), '', _var_name)
+ _var_value = li[1].strip()
+ dt[_key][_var_name] = _var_value
+ dt.pop(_key_r)
+ # detect virtual nodes
+ if "hypervisor_vendor" in dt[_key]:
+ dt['node_type'] = "virtual"
+ else:
+ dt['node_type'] = "physical"
- def _free(field, key, _dict):
+ def _free(_dict):
+ _key = "ram"
+ _key_r = "ram_raw"
_f_cmd = salt_master.get_cmd_for_nodes
- _cmd = "free -h | sed -n '/Mem/s/ \\+/ /gp' | cut -d\" \" -f {}"
- _f_cmd(_cmd.format(field), key, target_dict=_dict)
+ _cmd = "free -h | sed -n '/Mem/s/ \\+/ /gp'"
+ _f_cmd(_cmd, _key_r, target_dict=_dict)
+ # parse them and put into dict
+ for node, dt in _dict.iteritems():
+ dt[_key] = {}
+ if dt['status'] == DOWN:
+ continue
+ li = dt[_key_r].split()
+ dt[_key]['total'] = li[1]
+ dt[_key]['used'] = li[2]
+ dt[_key]['free'] = li[3]
+ dt[_key]['shared'] = li[4]
+ dt[_key]['cache'] = li[5]
+ dt[_key]['available'] = li[6]
+
+ _total = get_bytes(li[1])
+ _avail = get_bytes(li[6])
+ _m = _avail * 100.0 / _total
+ if _m < _ram_critical:
+ dt[_key]["status"] = "fail"
+ elif _m < _ram_warn:
+ dt[_key]["status"] = "warn"
+ else:
+ dt[_key]["status"] = ""
def _services(_dict):
_key = "services"
@@ -236,6 +287,8 @@
_f_cmd(_cmd, _key_r, target_dict=_dict)
for node, dt in _dict.iteritems():
dt[_key] = {}
+ if dt['status'] == DOWN:
+ continue
lines = dt[_key_r].splitlines()
for line in lines:
li = line.split()
@@ -249,11 +302,107 @@
dt[_key][_name] = None
dt.pop(_key_r)
+ def _vcp_status(_dict):
+ _key = "virsh"
+ _key_r = "virsh_raw"
+ salt_master.get_cmd_for_nodes(
+ "virsh list --all | sed -n -e '/[0-9]/s/ \\+/ /gp'",
+ _key_r,
+ target_dict=_dict,
+ nodes="kvm*"
+ )
+ _kvm = filter(lambda x: x.find("kvm") >= 0, _dict.keys())
+ for node in _kvm:
+ dt = _dict[node]
+ dt[_key] = {}
+ if dt['status'] == DOWN:
+ continue
+ lines = dt[_key_r].splitlines()
+ for line in lines:
+ li = line.split()
+ _id = li[0]
+ _name = li[1]
+ _status = li[2]
+ dt[_key][_name] = {
+ 'id': _id,
+ 'status': _status
+ }
+ dt.pop(_key_r)
+
+ # query per-cpu and count totals
+ # total (0), dropped(1), squeezed (2), collision (7)
+ def _soft_net_stats(_dict):
+ _key = "net_stats"
+ _key_r = "net_stats_raw"
+ _f_cmd = salt_master.get_cmd_for_nodes
+ _cmd = "cat /proc/net/softnet_stat; echo \\#; " \
+ "sleep {}; cat /proc/net/softnet_stat".format(
+ _softnet_interval
+ )
+ _f_cmd(_cmd, _key_r, target_dict=_dict)
+ for node, dt in _dict.iteritems():
+ _cpuindex = 1
+ _add_mode = True
+ # final totals
+ dt[_key] = {
+ "total": [0, 0, 0, 0]
+ }
+ # totals for start mark
+ _ts = [0, 0, 0, 0]
+ # skip if node is down
+ if dt['status'] == DOWN:
+ continue
+ lines = dt[_key_r].splitlines()
+ for line in lines:
+ if line.startswith("#"):
+ _add_mode = False
+ _cpuindex = 1
+ continue
+ li = line.split()
+ _c = [
+ int(li[0], 16),
+ int(li[1], 16),
+ int(li[2], 16),
+ int(li[7], 16)
+ ]
+ _id = "cpu{:02}".format(_cpuindex)
+ if _id not in dt[_key]:
+ dt[_key][_id] = []
+ _dc = dt[_key][_id]
+ if _add_mode:
+ # saving values and adding totals
+ dt[_key][_id] = _c
+ # save start totals
+ _ts = [_ts[i]+_c[i] for i in range(0, len(_c))]
+ else:
+ # this is second measurement
+ # subtract all values
+ for i in range(len(_c)):
+ dt[_key][_id][i] = _c[i] - _dc[i]
+ dt[_key]["total"][i] += _c[i]
+ _cpuindex += 1
+ # finally, subtract initial totals
+ for k, v in dt[_key].iteritems():
+ if k != "total":
+ dt[_key][k] = [v[i] / 5. for i in range(len(v))]
+ else:
+ dt[_key][k] = [(v[i]-_ts[i])/5. for i in range(len(v))]
+ dt.pop(_key_r)
+
+ # prepare yellow and red marker values
data["const"] = {
+ "net_interval": _softnet_interval,
"ram_warn": _ram_warn,
"ram_critical": _ram_critical,
"disk_warn": _disk_warn,
- "disk_critical": _disk_critical
+ "disk_critical": _disk_critical,
+ "services": read_file_as_lines(
+ os.path.join(
+ pkg_dir,
+ 'etc',
+ 'services.list'
+ )
+ )
}
# get kernel version
@@ -262,30 +411,12 @@
"kernel",
target_dict=data["nodes"]
)
- # cpu info
- # Sample: VT-x, KVM, full
- _lscpu("Virtualization", "virt_mode", data["nodes"])
- _lscpu("Hypervisor vendor", "virt_vendor", data["nodes"])
- _lscpu("Virtualization type", "virt_type", data["nodes"])
- # sample: 4
- _lscpu("CPU(s)", "cpus", data["nodes"])
+ # process lscpu data
+ _lscpu(data["nodes"])
# free ram
# sample: 16425392 14883144 220196
- _free("2", "ram_total", data["nodes"])
- _free("3", "ram_used", data["nodes"])
- _free("4", "ram_free", data["nodes"])
- _free("7", "ram_available", data["nodes"])
- for _data in data["nodes"].itervalues():
- _total = get_bytes(_data["ram_total"])
- _avail = get_bytes(_data["ram_available"])
- _m = _avail * 100.0 / _total
- if _m < _ram_critical:
- _data["ram_status"] = "fail"
- elif _m < _ram_warn:
- _data["ram_status"] = "warn"
- else:
- _data["ram_status"] = ""
+ _free(data["nodes"])
# disk space
# sample: /dev/vda1 78G 33G 45G 43%
@@ -329,6 +460,12 @@
_err if d['subnet_gateway_error'] else ""
_services(data["nodes"])
+ # vcp status
+ # query virsh and prepare for report
+ _vcp_status(data["nodes"])
+
+ # soft net stats
+ _soft_net_stats(data["nodes"])
class ReportToFile(object):