Ceph Info command
Updates
- ceph module with 'info', 'report' and 'bench' commands
- mcp-checker ceph info command collects Ceph config
and creates an archive
- ceph report command creates an HTML document with
info collected from the Ceph cluster
- Basic SMART data output in info and full output in report
- skeleton of the ceph bench command to run synced tests
Fixes
- kube helper commands use proper naming
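Usage examples (filenames are placeholders)
- mcp-checker ceph info --tgz ceph_info.tgz
- mcp-checker ceph report --html ceph_report.html
- mcp-checker ceph bench --task-list ceph_tasks.json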
Change-Id: Ia5aaa343f7d1c38a67d34e60215801bbb0fea097
Related-PROD: PROD-36605
diff --git a/cfg_checker/common/kube_utils.py b/cfg_checker/common/kube_utils.py
index 042db5d..e1aafbb 100644
--- a/cfg_checker/common/kube_utils.py
+++ b/cfg_checker/common/kube_utils.py
@@ -207,6 +207,13 @@
self._coreV1 = None
self._appsV1 = None
self._podV1 = None
+ self._custom = None
+
+ @property
+ def CustomObjects(self):
+ if not self._custom:
+ self._custom = kclient.CustomObjectsApi(self.kApi)
+ return self._custom
@property
def CoreV1(self):
@@ -298,6 +305,45 @@
return _nodes
+ def get_pod_names_by_partial_name(self, partial_name, ns):
+ logger_cli.debug('... searching for pods with {}'.format(partial_name))
+ _pods = self.CoreV1.list_namespaced_pod(ns)
+ _names = self._get_listed_attrs(_pods.items, "metadata.name")
+ _pnames = [n for n in _names if partial_name in n]
+ if len(_pnames) > 1:
+ logger_cli.debug(
+ "... more than one pod found for '{}': {}\n".format(
+ partial_name,
+ ", ".join(_pnames)
+ )
+ )
+ elif len(_pnames) < 1:
+ logger_cli.warning(
+ "WARNING: No pods found for '{}'".format(partial_name)
+ )
+
+ return _pnames
+
+ def get_pods_by_partial_name(self, partial_name, ns):
+ logger_cli.debug('... searching for pods with {}'.format(partial_name))
+ _all_pods = self.CoreV1.list_namespaced_pod(ns)
+ _pods = [_pod for _pod in _all_pods.items
+ if partial_name in _pod.metadata.name]
+ if len(_pods) > 1:
+ logger_cli.debug(
+ "... more than one pod found for '{}': {}\n".format(
+ partial_name,
+ ", ".join(partial_name)
+ )
+ )
+ elif len(_pods) < 1:
+ logger_cli.warning(
+ "WARNING: No pods found for '{}'".format(partial_name)
+ )
+
+ return _pods
+
def exec_on_target_pod(
self,
cmd,
@@ -307,6 +353,7 @@
_request_timeout=120,
**kwargs
):
+ _pname = ""
if not strict:
logger_cli.debug(
"... searching for pods with the name '{}'".format(pod_name)
@@ -314,7 +361,6 @@
_pods = {}
_pods = self.CoreV1.list_namespaced_pod(namespace)
_names = self._get_listed_attrs(_pods.items, "metadata.name")
- _pname = ""
_pnames = [n for n in _names if n.startswith(pod_name)]
if len(_pnames) > 1:
logger_cli.debug(
@@ -325,7 +371,7 @@
)
)
_pname = _pnames[0]
- elif len(_pname) < 1:
+ elif len(_pnames) < 1:
raise KubeException("No pods found for '{}'".format(pod_name))
else:
_pname = pod_name
@@ -373,7 +419,9 @@
if _ns is None:
logger_cli.debug("... creating namespace '{}'".format(ns))
- _r = self.CoreV1.create_namespace(ns)
+ _new_ns = kclient.V1Namespace()
+ _new_ns.metadata = kclient.V1ObjectMeta(name=ns)
+ _r = self.CoreV1.create_namespace(_new_ns)
# TODO: check return on fail
if not _r:
return False
@@ -494,7 +542,7 @@
)
# map func and cmd
-
+    logger_cli.error("ERROR: 'exec_on_all_pods' is not implemented yet")
# create result list
return []
@@ -566,3 +614,17 @@
self._coreV1 = None
return
+
+ def get_custom_resource(self, group, version, plural):
+        # list cluster-scoped custom objects of the given group/version/plural
+ # Example:
+ # kubernetes.client.CustomObjectsApi().list_cluster_custom_object(
+ # group="networking.istio.io",
+ # version="v1alpha3",
+ # plural="serviceentries"
+ # )
+ return self.CustomObjects.list_cluster_custom_object(
+ group=group,
+ version=version,
+ plural=plural
+ )
diff --git a/cfg_checker/modules/ceph/__init__.py b/cfg_checker/modules/ceph/__init__.py
new file mode 100644
index 0000000..ad4a207
--- /dev/null
+++ b/cfg_checker/modules/ceph/__init__.py
@@ -0,0 +1,150 @@
+from cfg_checker.common import logger_cli
+from cfg_checker.common.settings import ENV_TYPE_KUBE
+from cfg_checker.helpers import args_utils
+from cfg_checker.modules.ceph import info, bench
+
+command_help = "Ceph Storage information and benchmarks"
+supported_envs = [ENV_TYPE_KUBE]
+
+
+# def _selectClass(_env, strClassHint="checker"):
+# _class = None
+# if _env == ENV_TYPE_SALT:
+# if strClassHint == "info":
+# _class = info.SaltCephInfo
+# elif strClassHint == "bench":
+# _class = bench.SaltCephInfo
+# elif _env == ENV_TYPE_KUBE:
+# if strClassHint == "info":
+# _class = info.KubeCephInfo
+# elif strClassHint == "bench":
+# _class = bench.KubeCephBench
+# if not _class:
+# raise CheckerException(
+# "Unknown hint for selecting Ceph handler Class: '{}'".format(
+# strClassHint
+# )
+# )
+# else:
+# return _class
+
+
+def init_parser(_parser):
+    # ceph subparser
+ ceph_subparsers = _parser.add_subparsers(dest='type')
+
+ ceph_info_parser = ceph_subparsers.add_parser(
+ 'info',
+ help="Gather Ceph Cluster information"
+ )
+
+ ceph_info_parser.add_argument(
+ '--detailed',
+ action="store_true", default=False,
+ help="Print additional details"
+ )
+
+ ceph_info_parser.add_argument(
+ '--tgz',
+ metavar='ceph_tgz_filename',
+ help="HTML filename to save report"
+ )
+
+ ceph_report_parser = ceph_subparsers.add_parser(
+ 'report',
+ help="Generate network check report"
+ )
+
+ ceph_report_parser.add_argument(
+ '--html',
+ metavar='ceph_html_filename',
+ help="HTML filename to save report"
+ )
+
+ ceph_bench_parser = ceph_subparsers.add_parser(
+ 'bench',
+ help="Run ceph benchmark"
+ )
+
+ ceph_bench_parser.add_argument(
+ '--task-list',
+ metavar='ceph_tasks_filename',
+ help="List file with data for Ceph bench testrun"
+ )
+
+ return _parser
+
+
+def do_info(args, config):
+ # Ceph info
+ # Gather ceph info and create an archive with data
+ args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
+ # check tgz
+ _tgzfile = "ceph_info_archive.tgz" if not args.tgz else args.tgz
+
+ # _class = _selectClass(_env)
+ ceph_info = info.KubeCephInfo(config)
+
+ logger_cli.info("# Collecting Ceph cluster information")
+ logger_cli.warning(
+ "\nWARNING: 'ceph info' has 'Work in progress' status!\n"
+ )
+
+ ceph_info.gather_info()
+
+ # Debug, enable if needed to debug report generation
+    # without actually collecting data each time
+ # ceph_info.dump_info()
+ # ceph_info.load_info()
+ # end debug
+
+ ceph_info.print_summary()
+ ceph_info.generate_archive(_tgzfile)
+
+ return
+
+
+def do_report(args, config):
+ # Ceph Report
+ # Gather ceph info and create HTML report with all of the data
+ args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
+ _filename = args_utils.get_arg(args, 'html')
+ logger_cli.info("# Ceph cluster Configuration report")
+ logger_cli.warning(
+ "\nWARNING: 'ceph info' has 'Work in progress' status!\n"
+ )
+
+ # _class = _selectClass(_env)
+ ceph_info = info.KubeCephInfo(config)
+ # Debug, enable if needed to debug report generation
+    # without actually collecting data each time
+ # ceph_info.load_info()
+ # end debug
+ ceph_info.gather_info()
+ ceph_info.get_transposed_latency_table()
+ ceph_info.get_latest_health_readout()
+ ceph_info.create_html_report(_filename)
+
+ return
+
+
+def do_bench(args, config):
+ # Ceph Benchmark using multiple pods
+ # Prepare the tasks and do synced testrun
+ # TODO: html option to create a fancy report
+ args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
+
+ ceph_bench = bench.KubeCephBench(config)
+
+ logger_cli.error("ERROR: To be implemented...")
+
+ # Load tasks
+
+ # Do the testrun
+ ceph_bench.prepare_pods()
+ ceph_bench.run_benchmark()
+
+ # Create report
+ ceph_bench.create_report()
+
+ return
diff --git a/cfg_checker/modules/ceph/bench.py b/cfg_checker/modules/ceph/bench.py
new file mode 100644
index 0000000..28c7929
--- /dev/null
+++ b/cfg_checker/modules/ceph/bench.py
@@ -0,0 +1,48 @@
+from cfg_checker.common import logger_cli
+# from cfg_checker.common.exception import InvalidReturnException
+# from cfg_checker.common.exception import ConfigException
+# from cfg_checker.common.exception import KubeException
+
+from cfg_checker.nodes import KubeNodes
+
+
+class CephBench(object):
+    def __init__(
+ self,
+ config
+ ):
+ self.env_config = config
+ return
+
+ def prepare_pods(self):
+
+ return
+
+ def run_benchmark(self):
+
+ return
+
+ # Create report
+ def create_report(self):
+
+ return
+
+
+class SaltCephBench(CephBench):
+ def __init__(
+ self,
+ config
+ ):
+ logger_cli.error("ERROR: Not impelented for Sale environment!")
+
+ # self.master = SaltNodes(config)
+ super(SaltCephBench, self).__init__(
+ config
+ )
+ return
+
+
+class KubeCephBench(CephBench):
+ def __init__(self, config):
+ self.master = KubeNodes(config)
+ super(KubeCephBench, self).__init__(config)
diff --git a/cfg_checker/modules/ceph/info.py b/cfg_checker/modules/ceph/info.py
new file mode 100644
index 0000000..092c1c7
--- /dev/null
+++ b/cfg_checker/modules/ceph/info.py
@@ -0,0 +1,514 @@
+import json
+from time import sleep
+
+
+from cfg_checker.common import logger_cli
+from cfg_checker.common.exception import KubeException
+
+from cfg_checker.helpers.console_utils import Progress
+from cfg_checker.helpers.tgz import TGZFile
+from cfg_checker.nodes import KubeNodes
+from cfg_checker.reports import reporter
+
+
+class CephInfo(object):
+ def __init__(
+ self,
+ config
+ ):
+ self.env_config = config
+ return
+
+ def get_transposed_latency_table(self):
+ _table = {
+ "<dev>": []
+ }
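+        # one row per sample: '<dev>' holds the row labels, each OSD gets a column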
+ for _pfd in self.ceph_info['osd_latency_data']['data']['data']:
+ _table["<dev>"].append({
+ "formatted": " cL/aL ",
+ "commit_latency_ms": "Commit, ms",
+ "apply_latency_ms": "Apply, ms",
+ "commit_latency_ns": "Commit, ns",
+ "apply_latency_ns": "Apply, ns"
+ })
+ for _f in _pfd['osdstats']['osd_perf_infos']:
+ _n = "osd_{}".format(_f['id'])
+ if _n not in _table:
+ _table[_n] = []
+ _table[_n].append({
+ "formatted": "{:>3}/{:<3}".format(
+ _f['perf_stats']['commit_latency_ms'],
+ _f['perf_stats']['apply_latency_ms'],
+ ),
+ "commit_latency_ms": _f['perf_stats']['commit_latency_ms'],
+ "apply_latency_ms": _f['perf_stats']['apply_latency_ms'],
+ "commit_latency_ns": _f['perf_stats']['commit_latency_ns'],
+ "apply_latency_ns": _f['perf_stats']['apply_latency_ns']
+ })
+ self.ceph_info['osd_latency_data']['table'] = _table
+ return _table
+
+ def get_latest_health_readout(self):
+ _h = self.ceph_info['ceph_health']['data']
+ self.ceph_info['ceph_health']['latest'] = {}
+ for _n, _d in _h.items():
+ if not _d:
+ self.ceph_info['ceph_health']['latest'][_n] = {}
+ continue
+ else:
+                # TODO: Consider filtering out or preparing data for the table
+ _date = sorted(_d.keys(), reverse=True)[0]
+ self.ceph_info['ceph_health']['date'] = _date
+ self.ceph_info['ceph_health']['latest'][_n] = _d[_date]
+
+ return self.ceph_info['ceph_health']['latest']
+
+ def print_summary(self):
+ logger_cli.info("\n# Ceph Cluster summary")
+ # Health status
+ _h = self.ceph_info['health_detail']['data']
+ logger_cli.info("Cluster status: {}".format(_h['status']))
+ for _chk, _d in _h['checks'].items():
+ logger_cli.info(
+ "+ {}: {}\n\tSummary: {}".format(
+ _chk,
+ _d['severity'],
+ _d['summary']['message']
+ )
+ )
+ logger_cli.info("\tDetails:")
+ for _item in _d['detail']:
+                logger_cli.info("\t '{}'".format(_item['message']))
+
+ # OSD health metrics
+ logger_cli.info("\n# Device health metrics:")
+ _fmt = " {:45} {:^14} {:^9} {:^6} {:^6}"
+ logger_cli.info(
+ _fmt.format(
+ "Device Name",
+ "Info",
+ "Speed",
+ "SMART",
+ "Tempr."
+ )
+ )
+ _latest = self.get_latest_health_readout()
+ for _n, _d in _latest.items():
+ if not _d:
+ logger_cli.info("{:45} {:<10}".format(_n, "<empty>"))
+ continue
+
+ _status = _d['ata_smart_data']['self_test']['status']['passed']
+
+ _status = 'passed' if _status else 'failed'
+ logger_cli.info(
+ _fmt.format(
+ _n,
+ _d['device']['info_name'],
+ _d['interface_speed']['current']['string'],
+ _status,
+ _d['temperature']['current']
+ )
+ )
+
+ # Latency table
+ logger_cli.info(
+ "\n# OSD Latency data ({} iterations, {} sec delay), "
+ "table items 'osd_dev: N:cL/aL'\n"
+ " 'Commit Latency' -> 'cL', 'Apply Latency' -> 'aL'\n".format(
+ self.ceph_info['osd_latency_data']['data']['total'],
+ self.ceph_info['osd_latency_data']['data']['delay']
+ )
+ )
+ _strs = self.get_transposed_latency_table()
+ for _osd, _list in _strs.items():
+ _row = [c["formatted"] for c in _list]
+ logger_cli.info(
+ " {:8}: {}".format(
+ _osd,
+ " ".join(_row)
+ )
+ )
+ logger_cli.info("\n")
+
+ # critical config values
+ # TODO: print/calculate config values
+
+ return
+
+ def dump_info(self):
+ with open('cephdump.json', 'wt') as _f:
+ _f.write(json.dumps(self.ceph_info, indent=2))
+
+ def load_info(self):
+ with open('cephdump.json', 'rt') as _f:
+ self.ceph_info = json.load(_f)
+
+ def generate_archive(self, tgzfilename):
+ if not self.ceph_info:
+ logger_cli.warning(
+ "WARNING: Ceph Info Data not detected. "
+ "Consider check for errors in log."
+ )
+ else:
+ # Create Archive
+ logger_cli.info("-> Generating archive '{}'".format(tgzfilename))
+ _tgz = TGZFile(
+ tgzfilename,
+ label="MCP Checker: Generated Ceph Information"
+ )
+ # Iterate every key and write data to tar file
+ for key, d in self.ceph_info.items():
+ _filename = None
+ # Cast buf to a proper type
+ _buf = None
+ if isinstance(d["data"], dict) or isinstance(d["data"], list):
+ _buf = json.dumps(d["data"], indent=2)
+ _filename = key + ".json"
+ elif isinstance(d["data"], str):
+ _buf = d["data"]
+ _filename = key + ".txt"
+ else:
+ _buf = str(d["data"])
+ _filename = key + ".txt"
+ logger_cli.debug("... writing '{}'".format(_filename))
+ _tgz.add_file(_filename, buf=_buf, replace=True)
+
+ return
+
+ def create_html_report(self, filename):
+ """
+ Create static html showing ceph info report
+
+ :return: none
+ """
+ logger_cli.info("### Generating report to '{}'".format(filename))
+ _report = reporter.ReportToFile(
+ reporter.HTMLCephInfo(self),
+ filename
+ )
+ _report(
+ {
+ "info": self.ceph_info,
+ "cluster": self.cluster_info,
+ "nodes": self.nodes,
+ "ceph_version": self.ceph_version,
+ }
+ )
+ logger_cli.info("-> Done")
+
+ return
+
+
+class SaltCephInfo(CephInfo):
+ def __init__(
+ self,
+ config
+ ):
+        logger_cli.warning("\nWARNING: Not implemented for Salt environment!\n")
+
+ # self.master = SaltNodes(config)
+ super(SaltCephInfo, self).__init__(config)
+ return
+
+
+class KubeCephInfo(CephInfo):
+ ceph_ns = "rook-ceph"
+ ceph_app_label = "rook-ceph-tools"
+ ceph_group = "ceph.rook.io"
+ ceph_apiversion = "v1"
+ ceph_plural = "cephclusters"
+ ceph_version = "unknown"
+
+ def __init__(self, config):
+ self.master = KubeNodes(config)
+ super(KubeCephInfo, self).__init__(config)
+ # Init ceph tools pod
+ self.pod_name = self._get_tools_pod_name()
+ self.ceph_info = {}
+ self.cluster_info = {}
+ self.ceph_version = self.get_ceph_cluster_config()
+
+ def _safe_tools_cmd(self, cmd, expect_output=True):
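+        # run a command inside the ceph tools pod and flag unexpected output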
+ _r = self.master.exec_cmd_on_target_pod(
+ self.pod_name,
+ self.ceph_ns,
+ cmd
+ )
+ if expect_output and not _r:
+ logger_cli.debug("... got empty output for '{}'".format(cmd))
+ elif not expect_output and _r:
+ logger_cli.warning(
+ "WARNING: Unexpected output for '{}':\n"
+ "===== Start\n{}\n===== End".format(cmd, _r)
+ )
+ return _r
+
+ def _safe_get_cmd_output_as_json(self, cmd):
+ _buf = self._safe_tools_cmd(cmd)
+ try:
+ return json.loads(_buf)
+ except ValueError:
+ logger_cli.error(
+ "\nERROR: failed to parse json: '{}'".format(_buf)
+ )
+ return _buf
+
+ def _get_tools_pod_name(self):
+ # get ceph pod
+ _names = self.master.kube.get_pod_names_by_partial_name(
+ self.ceph_app_label,
+ self.ceph_ns
+ )
+ if not _names:
+ raise KubeException(
+ "Failed to find pod using '{}'".format(self.ceph_app_label)
+ )
+ elif len(_names) > 1:
+ logger_cli.warning(
+ "WARNING: Environment has more than one pod "
+ "with '{}' app: {}".format(
+ self.ceph_app_label,
+ ", ".join(_names)
+ )
+ )
+ else:
+ logger_cli.debug("... found '{}'".format(_names[0]))
+ return _names[0]
+
+ def _add_ceph_info_item(self, key, title, data):
+ if key in self.ceph_info:
+ self.ceph_info[key]["title"] = title
+ self.ceph_info[key]["data"] = data
+ else:
+ self.ceph_info[key] = {
+ "title": title,
+ "data": data
+ }
+
+ def _parse_dev_classes(self, deviceClasses):
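+        # flatten the list of dicts into a unique set of device class names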
+ _devClasses = []
+ for _i in deviceClasses:
+ _devClasses += list(_i.values())
+ return set(_devClasses)
+
+ def get_ceph_cluster_config(self):
+ # get cephclusters resource
+ logger_cli.info("# Loading '{}' object of type '{}/{}'".format(
+ self.ceph_plural,
+ self.ceph_group,
+ self.ceph_apiversion
+ ))
+ _r = self.master.kube.get_custom_resource(
+ self.ceph_group,
+ self.ceph_apiversion,
+ self.ceph_plural,
+ )
+ # find cluster
+ _cluster = None
+ if len(_r['items']) < 1:
+ logger_cli.warning(
+ "WARNING: Failed to find '{}' ({}/{})".format(
+ self.ceph_plural,
+ self.ceph_group,
+ self.ceph_apiversion
+ )
+ )
+            return 'unknown'
+ elif len(_r['items']) > 1:
+ logger_cli.warning(
+ "WARNING: Multiple clusters found '{}' ({}/{})".format(
+ self.ceph_plural,
+ self.ceph_group,
+ self.ceph_apiversion
+ )
+ )
+ _cluster = _r['items'][0]
+ _s = _cluster['status']
+ self.cluster_info.update({
+ 'image': _s['version']['image'],
+ 'version': _s['version']['version'],
+ 'device_classes': self._parse_dev_classes(
+ _s['storage']['deviceClasses']
+ ),
+ 'phase': _s['phase'],
+ 'state': _s['state'],
+ 'health': _s['ceph']['health'],
+ 'previousHealth': _s['ceph']['previousHealth'],
+ 'lastChanged': _s['ceph']['lastChanged'],
+ 'lastChecked': _s['ceph']['lastChecked'],
+ 'mon_count': _cluster['spec']['mon']['count']
+ })
+        self.nodes = _cluster['spec']['storage']['nodes']
+ logger_cli.info("-> Found Ceph cluster: {} ({})".format(
+ self.cluster_info['version'],
+ self.cluster_info['image']
+ ))
+ return self.cluster_info['version']
+
+ def gather_info(self):
+ logger_cli.info("# Gathering Ceph cluster info")
+ # Collect info
+ _c = self._safe_tools_cmd
+ _cj = self._safe_get_cmd_output_as_json
+ # Crush Map
+ logger_cli.info("-> Collecting CRUSH map")
+ _cmap_tmp_path = "/tmp/crushmap.bin"
+ _r = _c(
+ "ceph osd getcrushmap -o " + _cmap_tmp_path,
+ expect_output=False
+ )
+ # TODO: Handle errors in _r
+ logger_cli.debug("... 'getcrushmap' return value is: '{}'".format(_r))
+
+ # Get Crush map as json and text
+ self._add_ceph_info_item(
+ "crushmap_json",
+ "Crush Map (json)",
+ _cj("crushtool -i " + _cmap_tmp_path + " --dump")
+ )
+ self._add_ceph_info_item(
+ "crushmap_text",
+ "Crush Map (text)",
+ _c("crushtool -d " + _cmap_tmp_path)
+ )
+
+ logger_cli.info("-> Collecting ceph osd crush dump")
+ self._add_ceph_info_item(
+ "osd_crushdump",
+ "Crush dump (osd)",
+ _cj("ceph osd crush dump")
+ )
+
+ logger_cli.info("-> Collecting cluster status")
+ self._add_ceph_info_item(
+ "cluster_status",
+ "Cluster status",
+ _cj("ceph -s -f json")
+ )
+
+ logger_cli.info("-> Collecting health detail")
+ self._add_ceph_info_item(
+ "health_detail",
+ "Health details",
+ _cj("ceph -f json health detail")
+ )
+
+ logger_cli.info("-> Collecting monmap")
+ self._add_ceph_info_item(
+ "monmap",
+ "Ceph Mon map",
+ _cj("ceph mon dump -f json")
+ )
+
+ logger_cli.info("-> Collecting ceph df")
+ self._add_ceph_info_item(
+ "ceph_df",
+ "Ceph DF",
+ _cj("ceph df -f json")
+ )
+
+ logger_cli.info("-> Collecting ceph osd df")
+ self._add_ceph_info_item(
+ "ceph_osd_df",
+ "Ceph OSD DF",
+ _cj("ceph osd df -f json")
+ )
+
+ logger_cli.info("-> Collecting ceph osd dump")
+ self._add_ceph_info_item(
+ "ceph_osd_dump",
+ "Ceph OSD dump",
+ _cj("ceph osd dump -f json")
+ )
+
+ logger_cli.info("-> Collecting rados df")
+ self._add_ceph_info_item(
+ "rados_df",
+ "Rados DF",
+ _cj("rados df -f json")
+ )
+
+ logger_cli.info("-> Collecting ceph report")
+ self._add_ceph_info_item(
+ "ceph_report",
+ "Ceph Report",
+ _cj("ceph report")
+ )
+
+ logger_cli.info("-> Collecting auth data anonymized")
+ _auth_data = _cj("ceph auth list -f json")
+ # Anonymize data
+ # _cj("ceph auth list -f json | sed 's/AQ[^=]*==/KEY/g'")
+ for item in _auth_data["auth_dump"]:
+ if "key" in item:
+ item['key'] = "key-data-redacted"
+ self._add_ceph_info_item(
+ "ceph_auth_ls",
+ "Ceph Auth Data (anonymized)",
+ _auth_data
+ )
+
+ logger_cli.info("-> Collecting ceph pg dump")
+ self._add_ceph_info_item(
+ "ceph_pg_dump",
+ "Ceph PG dump",
+ _cj("ceph pg dump -f json")
+ )
+
+ logger_cli.info("-> Collecting ceph running configuration")
+ self._add_ceph_info_item(
+ "ceph_config_dump",
+ "Ceph Configuration Dump",
+ _cj("ceph config dump -f json")
+ )
+
+ logger_cli.info("-> Collecting health metrics")
+ _health_metrics = {}
+ _devices = _c("ceph device ls")
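+        # parse 'ceph device ls' output: first token is the device id,
+        # third is the owning daemon; the header row is skipped below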
+ for device in _devices.splitlines():
+ _t = device.split()
+ _osd = _t[2]
+ _dev = _t[0]
+ if _dev == "DEVICE":
+ continue
+ _metric = _cj("ceph device get-health-metrics {}".format(_dev))
+ _health_metrics["{}_{}".format(_osd, _dev)] = _metric
+ self._add_ceph_info_item(
+ "ceph_health",
+ "Ceph Health Metrics",
+ _health_metrics
+ )
+
+ # Latency values
+        # constants for the latency sampling run
+ _latency_count = 10
+ _latency_delay = 4
+ logger_cli.info(
+ "-> Collecting ceph osd latency data "
+ "({} total, {} sec delay)".format(
+ _latency_count,
+ _latency_delay
+ )
+ )
+ _osd_lat = {
+ "total": _latency_count,
+ "delay": _latency_delay,
+ "data": []
+ }
+ _progress = Progress(_latency_count)
+ _index = 1
+ while _index <= _latency_count:
+ _progress.write_progress(_index)
+ _osd_lat["data"].append(_cj("ceph osd perf -f json"))
+ sleep(_latency_delay)
+ _index += 1
+ _progress.end()
+ self._add_ceph_info_item(
+ "osd_latency_data",
+ "OSD Latency metrics",
+ _osd_lat
+ )
+
+ return
diff --git a/cfg_checker/modules/network/__init__.py b/cfg_checker/modules/network/__init__.py
index 4c95ef3..a99fa9e 100644
--- a/cfg_checker/modules/network/__init__.py
+++ b/cfg_checker/modules/network/__init__.py
@@ -154,7 +154,7 @@
config
)
# Start command
- logger_cli.info("# Network report (check, node map")
+ logger_cli.info("# Network report (check, node map)")
_filename = args_utils.get_arg(args, 'html')
_skip, _skip_file = args_utils.get_skip_args(args)
diff --git a/cfg_checker/modules/network/pinger.py b/cfg_checker/modules/network/pinger.py
index 04a5f68..60c80cc 100644
--- a/cfg_checker/modules/network/pinger.py
+++ b/cfg_checker/modules/network/pinger.py
@@ -401,7 +401,7 @@
"targets.json"
)
# execute ping.py
- _result = self.mapper.master.exec_on_target_pod(
+ _result = self.mapper.master.exec_script_on_target_pod(
_pname,
"ping.py",
args=[_path]
diff --git a/cfg_checker/nodes.py b/cfg_checker/nodes.py
index d87d829..ef2219c 100644
--- a/cfg_checker/nodes.py
+++ b/cfg_checker/nodes.py
@@ -1043,7 +1043,7 @@
logger_cli.error("Timed out waiting for Daemonset to be ready")
return False
- def exec_on_target_pod(self, pod_name, script_filename, args=None):
+ def exec_script_on_target_pod(self, pod_name, script_filename, args=None):
"""
Run script from configmap on target pod assuming it is present
"""
@@ -1064,6 +1064,18 @@
)
return _result
+ def exec_cmd_on_target_pod(self, pod_name, ns, command_str):
+ """
+        Run a shell command on the target pod in the given namespace
+ """
+ _result = self.kube.exec_on_target_pod(
+ command_str,
+ pod_name,
+ ns,
+ strict=True
+ )
+ return _result
+
def execute_script_on_daemon_set(self, ds, script_filename, args=None):
"""
Query daemonset for pods and execute script on all of them
diff --git a/cfg_checker/reports/reporter.py b/cfg_checker/reports/reporter.py
index 7ddbc4f..1f54ff3 100644
--- a/cfg_checker/reports/reporter.py
+++ b/cfg_checker/reports/reporter.py
@@ -32,6 +32,11 @@
return text.replace("\n", "<br />")
+def tabstops(text):
+    # replace tab characters with the HTML tab entity
+    return text.replace("\t", "&#9;")
+
+
def get_sorted_keys(td):
# detect if we can sort by desc
# Yes, this is slow, but bullet-proof from empty desc
@@ -116,6 +121,78 @@
return _text
+def to_gb(bytes_str):
+ _bytes = int(bytes_str)
+ _gb = _bytes / 1024 / 1024 / 1024
+ return "{}".format(round(_gb, 2))
+
+
+def to_mb(bytes_str):
+ _bytes = int(bytes_str)
+ _mb = _bytes / 1024 / 1024
+ return "{}".format(round(_mb, 2))
+
+
+def get_bucket_item_name(id, cmap):
+ for buck in cmap["buckets"]:
+ if id == buck["id"]:
+ return buck["name"]
+ for dev in cmap["devices"]:
+ if id == dev["id"]:
+ return dev["name"]
+ return id
+
+
+def get_rule_steps(steps):
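+    # rebuild readable crush rule step strings from the dumped step dicts,
+    # e.g. {"op": "take", "item_name": "default"} -> "step take default"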
+ _steps = []
+ for step in steps:
+ _ops = step.pop("op").split('_')
+ if "take" in _ops:
+ _steps.append(
+ "step {} {}".format(
+ " ".join(_ops),
+ step["item_name"]
+ )
+ )
+ else:
+ _steps.append(
+ "step {} {}".format(
+ " ".join(_ops),
+ " ".join(["{} {}".format(k, v) for k, v in step.items()])
+ )
+ )
+ return _steps
+
+
+def get_osdmap(cs):
+    # descend through nested 'osdmap' wrappers until the map
+    # carrying 'epoch' is found
+    _osdmap = cs
+    while "osdmap" in _osdmap:
+        _osdmap = _osdmap["osdmap"]
+    if "epoch" in _osdmap:
+        return _osdmap
+    # nothing usable found, return an empty stub
+ return {
+ "epoch": 0,
+ "num_osds": 0,
+ "num_up_osds": 0,
+ "osd_up_since": 0,
+ "num_in_osds": 0,
+ "osd_in_since": 0,
+ "num_remapped_pgs": 0
+ }
+
+
+def get_pool_stats(id, pgdump):
+ _stats = {}
+ for pool in pgdump["pg_map"]["pool_stats"]:
+ if id == pool["poolid"]:
+ _stats = pool
+ return _stats
+
+
@six.add_metaclass(abc.ABCMeta)
class _Base(object):
def __init__(self, master=None):
@@ -167,6 +244,12 @@
self.jinja2_env.filters['pkg_action_class'] = make_pkg_action_class
self.jinja2_env.filters['node_status_class'] = make_node_status
self.jinja2_env.filters['pkg_repo_info'] = make_repo_info
+ self.jinja2_env.filters['to_gb'] = to_gb
+ self.jinja2_env.filters['to_mb'] = to_mb
+ self.jinja2_env.filters['get_bucket_item_name'] = get_bucket_item_name
+ self.jinja2_env.filters['get_rule_steps'] = get_rule_steps
+ self.jinja2_env.filters['get_pool_stats'] = get_pool_stats
+ self.jinja2_env.filters['get_osdmap'] = get_osdmap
# render!
logger_cli.info("-> Using template: {}".format(self.tmpl))
@@ -195,6 +278,11 @@
tmpl = "pkg_versions_html.j2"
+# HTML Ceph information report
+class HTMLCephInfo(_TMPLBase):
+ tmpl = "ceph_info_html.j2"
+
+
# Package versions report
class HTMLModelCompare(_TMPLBase):
tmpl = "model_tree_cmp_tmpl.j2"