Additions and fixes to network check
- Per interface tree maps
- Proper virtual nodes detection
- KVM nodes listing
- CPU count fix
- Basic service fail check (wip)
Change-Id: I62b68793404eeff957ef70468c954df2fda869a5
Related-PROD: PROD-38972
diff --git a/cfg_checker/reports/reporter.py b/cfg_checker/reports/reporter.py
index 8059fab..7aa376a 100644
--- a/cfg_checker/reports/reporter.py
+++ b/cfg_checker/reports/reporter.py
@@ -1,9 +1,11 @@
import abc
import os
+import re
import time
from cfg_checker.common import const
from cfg_checker.common import logger_cli
+from cfg_checker.common.file_utils import read_file_as_lines
from cfg_checker.nodes import salt_master
import jinja2
@@ -19,6 +21,10 @@
_disk_critical = 90
_ram_warn = 5
_ram_critical = 3
+_softnet_interval = 5
+
+UP = const.NODE_UP
+DOWN = const.NODE_DOWN
def line_breaks(text):
@@ -217,16 +223,61 @@
else:
return int(value)
- def _lscpu(field, key, _dict):
+ def _lscpu(_dict):
+ _key = "lscpu"
+ _key_r = "lscpu_raw"
+ # get all of the values
_f_cmd = salt_master.get_cmd_for_nodes
- _cmd = "lscpu | grep -e \"^{}:\" | cut -d\":\" -f2 " \
- "| sed -e 's/^[[:space:]]*//'"
- _f_cmd(_cmd.format(field), key, target_dict=_dict)
+ _cmd = "lscpu | sed -n '/\\:/s/ \\+/ /gp'"
+ _f_cmd(_cmd, _key_r, target_dict=_dict)
+ # parse them and put into dict
+ for node, dt in _dict.iteritems():
+ dt[_key] = {}
+ if dt['status'] == DOWN:
+ continue
+ lines = dt[_key_r].splitlines()
+ for line in lines:
+ li = line.split(':')
+ _var_name = li[0].lower()
+ _var_name = re.sub(' ', '_', _var_name)
+ _var_name = re.sub('|'.join(['\\(', '\\)']), '', _var_name)
+ _var_value = li[1].strip()
+ dt[_key][_var_name] = _var_value
+ dt.pop(_key_r)
+ # detect virtual nodes
+ if "hypervisor_vendor" in dt[_key]:
+ dt['node_type'] = "virtual"
+ else:
+ dt['node_type'] = "physical"
- def _free(field, key, _dict):
+ def _free(_dict):
+ _key = "ram"
+ _key_r = "ram_raw"
_f_cmd = salt_master.get_cmd_for_nodes
- _cmd = "free -h | sed -n '/Mem/s/ \\+/ /gp' | cut -d\" \" -f {}"
- _f_cmd(_cmd.format(field), key, target_dict=_dict)
+ _cmd = "free -h | sed -n '/Mem/s/ \\+/ /gp'"
+ _f_cmd(_cmd, _key_r, target_dict=_dict)
+ # parse them and put into dict
+ for node, dt in _dict.iteritems():
+ dt[_key] = {}
+ if dt['status'] == DOWN:
+ continue
+ li = dt[_key_r].split()
+ dt[_key]['total'] = li[1]
+ dt[_key]['used'] = li[2]
+ dt[_key]['free'] = li[3]
+ dt[_key]['shared'] = li[4]
+ dt[_key]['cache'] = li[5]
+ dt[_key]['available'] = li[6]
+
+ _total = get_bytes(li[1])
+ _avail = get_bytes(li[6])
+ _m = _avail * 100.0 / _total
+ if _m < _ram_critical:
+ dt[_key]["status"] = "fail"
+ elif _m < _ram_warn:
+ dt[_key]["status"] = "warn"
+ else:
+ dt[_key]["status"] = ""
def _services(_dict):
_key = "services"
@@ -236,6 +287,8 @@
_f_cmd(_cmd, _key_r, target_dict=_dict)
for node, dt in _dict.iteritems():
dt[_key] = {}
+ if dt['status'] == DOWN:
+ continue
lines = dt[_key_r].splitlines()
for line in lines:
li = line.split()
@@ -249,11 +302,107 @@
dt[_key][_name] = None
dt.pop(_key_r)
+ def _vcp_status(_dict):
+ _key = "virsh"
+ _key_r = "virsh_raw"
+ salt_master.get_cmd_for_nodes(
+ "virsh list --all | sed -n -e '/[0-9]/s/ \\+/ /gp'",
+ _key_r,
+ target_dict=_dict,
+ nodes="kvm*"
+ )
+ _kvm = filter(lambda x: x.find("kvm") >= 0, _dict.keys())
+ for node in _kvm:
+ dt = _dict[node]
+ dt[_key] = {}
+ if dt['status'] == DOWN:
+ continue
+ lines = dt[_key_r].splitlines()
+ for line in lines:
+ li = line.split()
+ _id = li[0]
+ _name = li[1]
+ _status = li[2]
+ dt[_key][_name] = {
+ 'id': _id,
+ 'status': _status
+ }
+ dt.pop(_key_r)
+
+ # query per-cpu and count totals
+ # total (0), dropped(1), squeezed (2), collision (7)
+ def _soft_net_stats(_dict):
+ _key = "net_stats"
+ _key_r = "net_stats_raw"
+ _f_cmd = salt_master.get_cmd_for_nodes
+ _cmd = "cat /proc/net/softnet_stat; echo \\#; " \
+ "sleep {}; cat /proc/net/softnet_stat".format(
+ _softnet_interval
+ )
+ _f_cmd(_cmd, _key_r, target_dict=_dict)
+ for node, dt in _dict.iteritems():
+ _cpuindex = 1
+ _add_mode = True
+ # final totals
+ dt[_key] = {
+ "total": [0, 0, 0, 0]
+ }
+ # totals for start mark
+ _ts = [0, 0, 0, 0]
+ # skip if node is down
+ if dt['status'] == DOWN:
+ continue
+ lines = dt[_key_r].splitlines()
+ for line in lines:
+ if line.startswith("#"):
+ _add_mode = False
+ _cpuindex = 1
+ continue
+ li = line.split()
+ _c = [
+ int(li[0], 16),
+ int(li[1], 16),
+ int(li[2], 16),
+ int(li[7], 16)
+ ]
+ _id = "cpu{:02}".format(_cpuindex)
+ if _id not in dt[_key]:
+ dt[_key][_id] = []
+ _dc = dt[_key][_id]
+ if _add_mode:
+ # saving values and adding totals
+ dt[_key][_id] = _c
+ # save start totals
+ _ts = [_ts[i]+_c[i] for i in range(0, len(_c))]
+ else:
+ # this is second measurement
+ # subtract all values
+ for i in range(len(_c)):
+ dt[_key][_id][i] = _c[i] - _dc[i]
+ dt[_key]["total"][i] += _c[i]
+ _cpuindex += 1
+ # finally, subtract initial totals
+ for k, v in dt[_key].iteritems():
+ if k != "total":
+ dt[_key][k] = [v[i] / 5. for i in range(len(v))]
+ else:
+ dt[_key][k] = [(v[i]-_ts[i])/5. for i in range(len(v))]
+ dt.pop(_key_r)
+
+ # prepare yellow and red marker values
data["const"] = {
+ "net_interval": _softnet_interval,
"ram_warn": _ram_warn,
"ram_critical": _ram_critical,
"disk_warn": _disk_warn,
- "disk_critical": _disk_critical
+ "disk_critical": _disk_critical,
+ "services": read_file_as_lines(
+ os.path.join(
+ pkg_dir,
+ 'etc',
+ 'services.list'
+ )
+ )
}
# get kernel version
@@ -262,30 +411,12 @@
"kernel",
target_dict=data["nodes"]
)
- # cpu info
- # Sample: VT-x, KVM, full
- _lscpu("Virtualization", "virt_mode", data["nodes"])
- _lscpu("Hypervisor vendor", "virt_vendor", data["nodes"])
- _lscpu("Virtualization type", "virt_type", data["nodes"])
- # sample: 4
- _lscpu("CPU(s)", "cpus", data["nodes"])
+ # process lscpu data
+ _lscpu(data["nodes"])
# free ram
# sample: 16425392 14883144 220196
- _free("2", "ram_total", data["nodes"])
- _free("3", "ram_used", data["nodes"])
- _free("4", "ram_free", data["nodes"])
- _free("7", "ram_available", data["nodes"])
- for _data in data["nodes"].itervalues():
- _total = get_bytes(_data["ram_total"])
- _avail = get_bytes(_data["ram_available"])
- _m = _avail * 100.0 / _total
- if _m < _ram_critical:
- _data["ram_status"] = "fail"
- elif _m < _ram_warn:
- _data["ram_status"] = "warn"
- else:
- _data["ram_status"] = ""
+ _free(data["nodes"])
# disk space
# sample: /dev/vda1 78G 33G 45G 43%
@@ -329,6 +460,12 @@
_err if d['subnet_gateway_error'] else ""
_services(data["nodes"])
+ # vcp status
+ # query virsh and prepare for report
+ _vcp_status(data["nodes"])
+
+ # soft net stats
+ _soft_net_stats(data["nodes"])
class ReportToFile(object):