# Author: Alex Savatieiev (osavatieiev@mirantis.com; a.savex@gmail.com)
# Copyright 2019-2022 Mirantis, Inc.
import base64
import json
import os
import tarfile
import io
from time import sleep
from datetime import datetime

from cfg_checker.common import logger_cli, logger
from cfg_checker.common.exception import KubeException

from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile
from cfg_checker.nodes import KubeNodes
from cfg_checker.reports import reporter


class CephInfo(object):
    def __init__(
        self,
        config
    ):
        self.env_config = config
        return

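    # The archive name is assembled from dot-joined tags; e.g. with
    # client="cust" and project="proj" the result would look like
    # "CephCollectData.cust.proj.2022-10-06.tar.gz" (date is illustrative).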
    def get_info_archive_filename(self, client, project):
        # prefill known data
        _tags = ["CephCollectData"]
        _tags.append(client)
        _tags.append(project)

        # generate date for tgz
        _file_datetime_fmt = "%Y-%m-%d"
        _dt = datetime.now().strftime(_file_datetime_fmt)
        _tags.append(_dt)

        # extension
        _tags.append("tar")
        _tags.append("gz")
        return ".".join(_tags)

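    # Transposes the per-iteration "ceph osd perf" samples into one row per
    # OSD. Illustrative shape of the result:
    #   {"<dev>": [<header>, ...],
    #    "osd_0": [{"formatted": "  5/12 ", "commit_latency_ms": 5, ...}, ...]}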
    def get_transposed_latency_table(self):
        _table = {
            "<dev>": []
        }
        for _pfd in self.ceph_info['osd_latency_data']['data']['data']:
            _table["<dev>"].append({
                "formatted": " cL/aL ",
                "commit_latency_ms": "Commit, ms",
                "apply_latency_ms": "Apply, ms",
                "commit_latency_ns": "Commit, ns",
                "apply_latency_ns": "Apply, ns"
            })
            for _f in _pfd['osdstats']['osd_perf_infos']:
                _n = "osd_{}".format(_f['id'])
                if _n not in _table:
                    _table[_n] = []
                _table[_n].append({
                    "formatted": "{:>3}/{:<3}".format(
                        _f['perf_stats']['commit_latency_ms'],
                        _f['perf_stats']['apply_latency_ms'],
                    ),
                    "commit_latency_ms": _f['perf_stats']['commit_latency_ms'],
                    "apply_latency_ms": _f['perf_stats']['apply_latency_ms'],
                    "commit_latency_ns": _f['perf_stats']['commit_latency_ns'],
                    "apply_latency_ns": _f['perf_stats']['apply_latency_ns']
                })
        self.ceph_info['osd_latency_data']['table'] = _table
        return _table

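    # Reduces the per-device health history to its most recent reading by
    # picking the latest date key; devices with no data stay as empty dicts.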
    def get_latest_health_readout(self):
        _h = self.ceph_info['ceph_health']['data']
        self.ceph_info['ceph_health']['latest'] = {}
        for _n, _d in _h.items():
            if not _d:
                self.ceph_info['ceph_health']['latest'][_n] = {}
                continue
            else:
                # TODO: Consider filtering out or prepare data for the table
                _osd = _d.pop("osd_name") if "osd_name" in _d else "unknown"
                _node_name = _d.pop("node_name") \
                    if "node_name" in _d else "unknown"
                # Additional check for empty data
                if not _d:
                    self.ceph_info['ceph_health']['latest'][_n] = {}
                    continue
                _date = sorted(_d.keys(), reverse=True)[0]
                self.ceph_info['ceph_health']['date'] = _date
                self.ceph_info['ceph_health']['latest'][_n] = _d[_date]
                self.ceph_info['ceph_health']['latest'][_n]["osd_name"] = _osd
                self.ceph_info['ceph_health']['latest'][_n]["node_name"] = \
                    _node_name

        return self.ceph_info['ceph_health']['latest']

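    # Prints the console summary from previously collected data; a typical
    # flow (illustrative) is:
    #   info = KubeCephInfo(config)
    #   info.gather_info()
    #   info.print_summary()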
    def print_summary(self):
        logger_cli.info("\n# Ceph Cluster summary")
        # Health status
        _h = self.ceph_info['health_detail']['data']
        logger_cli.info("Cluster status: {}".format(_h['status']))
        for _chk, _d in _h['checks'].items():
            logger_cli.info(
                "+ {}: {}\n\tSummary: {}".format(
                    _chk,
                    _d['severity'],
                    _d['summary']['message']
                )
            )
            logger_cli.info("\tDetails:")
            for _item in _d['detail']:
                logger_cli.info("\t '{}'".format(_item['message']))

        # OSD health metrics
        logger_cli.info("\n# Device health metrics:")
        _fmt = " {:45} {:^14} {:^9} {:^6} {:^6}"
        logger_cli.info(
            _fmt.format(
                "Device Name",
                "Info",
                "Speed",
                "SMART",
                "Tempr."
            )
        )
        _latest = self.get_latest_health_readout()
        for _n, _d in _latest.items():
            if not _d:
                logger_cli.info("{:45} {:<10}".format(_n, "<empty>"))
                continue

            _status = _d['smart_status']['passed']
            if "interface_speed" in _d:
                _speed = _d['interface_speed']['current']['string']
            else:
                _speed = "-"

            _status = 'passed' if _status else 'failed'
            logger_cli.info(
                _fmt.format(
                    _n,
                    _d['device']['info_name'],
                    _speed,
                    _status,
                    _d['temperature']['current']
                )
            )

        # Latency table
        logger_cli.info(
            "\n# OSD Latency data ({} iterations, {} sec delay), "
            "table items 'osd_dev: N:cL/aL'\n"
            " 'Commit Latency' -> 'cL', 'Apply Latency' -> 'aL'\n".format(
                self.ceph_info['osd_latency_data']['data']['total'],
                self.ceph_info['osd_latency_data']['data']['delay']
            )
        )
        _strs = self.get_transposed_latency_table()
        for _osd, _list in _strs.items():
            _row = [c["formatted"] for c in _list]
            logger_cli.info(
                " {:8}: {}".format(
                    _osd,
                    " ".join(_row)
                )
            )
        logger_cli.info("\n")

        # critical config values
        # TODO: print/calculate config values

        return

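    # dump_info()/load_info() give a plain-JSON round-trip of the collected
    # data via 'cephdump.json' in the current working directory, so a
    # snapshot can be dumped once and reloaded later for offline reporting.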
    def dump_info(self):
        with open('cephdump.json', 'wt') as _f:
            _f.write(json.dumps(self.ceph_info, indent=2))

    def load_info(self):
        with open('cephdump.json', 'rt') as _f:
            self.ceph_info = json.load(_f)

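    # Packs every collected item into the tgz archive: dict/list payloads are
    # written as pretty-printed JSON, everything else as plain text. An
    # explicit per-item "filename" (see _add_ceph_info_item) takes precedence
    # over the default "<key>.<ext>" produced by _ensure_fname.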
    def generate_archive(self, tgzfilename):
        def _ensure_fname(ext):
            return key + ext if _fname is None else _fname

        if not self.ceph_info:
            logger_cli.warning(
                "WARNING: Ceph Info Data not detected. "
                "Consider checking the log for errors."
            )
        else:
            # Create Archive
            logger_cli.info("-> Generating archive '{}'".format(tgzfilename))
            _tgz = TGZFile(
                tgzfilename,
                label="MCP Checker: Generated Ceph Information"
            )
            # Iterate every key and write data to tar file
            for key, d in self.ceph_info.items():
                _fname = None
                # Cast buf to a proper type
                _buf = None
                if "filename" in d:
                    _fname = d["filename"]
                if isinstance(d["data"], dict) or isinstance(d["data"], list):
                    _buf = json.dumps(d["data"], indent=2)
                    # _filename = key+".json" if _fname is not None else _fname
                    _filename = _ensure_fname(".json")
                elif isinstance(d["data"], str):
                    _buf = d["data"]
                    # _filename = key+".txt"
                    _filename = _ensure_fname(".txt")
                else:
                    _buf = str(d["data"])
                    # _filename = key+".txt"
                    _filename = _ensure_fname(".txt")
                logger_cli.debug("... writing '{}'".format(_filename))
                _tgz.add_file(_filename, buf=_buf, replace=True)

        return

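    # Usage (illustrative): info.create_html_report("ceph-info.html")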
    def create_html_report(self, filename):
        """
        Create a static HTML page with the Ceph info report

        :return: none
        """
        logger_cli.info("### Generating report to '{}'".format(filename))
        _report = reporter.ReportToFile(
            reporter.HTMLCephInfo(self),
            filename
        )
        _report(
            {
                "info": self.ceph_info,
                "cluster": self.cluster_info,
                "nodes": self.nodes,
                "ceph_version": self.ceph_version,
            }
        )
        logger_cli.info("-> Done")

        return


class SaltCephInfo(CephInfo):
    def __init__(
        self,
        config
    ):
        logger_cli.warning("\nWARNING: Not implemented for Salt environment!\n")

        # self.master = SaltNodes(config)
        super(SaltCephInfo, self).__init__(config)
        return


class KubeCephInfo(CephInfo):
    ceph_ns = "rook-ceph"
    ceph_app_label = "rook-ceph-tools"
    ceph_group = "ceph.rook.io"
    ceph_apiversion = "v1"
    ceph_plural = "cephclusters"
    ceph_version = "unknown"

    def __init__(self, config):
        self.master = KubeNodes(config)
        super(KubeCephInfo, self).__init__(config)
        # Init ceph tools pod
        self.pod_name = self._get_tools_pod_name()
        self.ceph_info = {}
        self.cluster_info = {}
        self.ceph_version = self.get_ceph_cluster_config()

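    # Thin wrapper around exec-ing in the rook-ceph tools pod; it only logs
    # when the output does not match the expect_output hint, never raises.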
    def _safe_tools_cmd(self, cmd_str, expect_output=True):
        _r = self.master.exec_cmd_on_target_pod(
            self.pod_name,
            self.ceph_ns,
            cmd_str
        )
        if expect_output and not _r:
            logger.debug("... got empty output for '{}'".format(cmd_str))
        elif not expect_output and _r:
            logger.warning(
                "WARNING: Unexpected output for '{}':\n"
                "===== Start\n{}\n===== End".format(cmd_str, _r)
            )
        return _r

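    # Fetches large command outputs (e.g. "ceph pg dump") via a temp file in
    # the pod instead of streaming them, presumably to keep them intact over
    # the exec channel. Equivalent pod-side pipeline (illustrative):
    #   <cmd> -o /tmp/checker_cmd_output
    #   tar -zcvf /tmp/checker_cmd.tgz /tmp/checker_cmd_output
    #   base64 /tmp/checker_cmd.tgz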
    def _safe_tools_cmd_zipped_output(self, cmd_str):
        # temp file
        _tmp_path = "/tmp"
        _filename = "checker_cmd_output"
        _tar_path = os.path.join(_tmp_path, "checker_cmd.tgz")
        _path = os.path.join(_tmp_path, _filename)

        # Run original cmd with redirect
        _cmd = [cmd_str, "-o", _path]
        self._safe_tools_cmd(" ".join(_cmd), expect_output=False)
        # zip it and base64 encode
        _cmd = ["tar", "-zcvf", _tar_path, _path]
        self._safe_tools_cmd(" ".join(_cmd))
        _b64 = self._safe_tools_cmd("base64 " + _tar_path)
        # decode and decompress
        _io = io.BytesIO(base64.standard_b64decode(_b64))
        _json = ""
        with tarfile.open(fileobj=_io) as _tar:
            _tar_item = _tar.extractfile(_tar.getmembers()[0])
            _json = _tar_item.read()
        # cleanup
        self._safe_tools_cmd("rm -f " + _path)
        self._safe_tools_cmd("rm -f " + _tar_path)
        return _json

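    # Best-effort JSON parsing: on failure the raw buffer is returned
    # unchanged and up to 512 characters of it are logged for diagnosis.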
    @staticmethod
    def _as_json(buf):
        try:
            return json.loads(buf)
        except ValueError as e:
            _out = ""
            if len(buf) > 512:
                _out = buf[:512]
                _out += "..."
            else:
                _out = buf
            logger_cli.error(
                "\nERROR: failed to parse json: '{}'. Data: '{}'".format(
                    e,
                    _out
                )
            )
            return buf

    def _safe_get_cmd_output_as_json(self, cmd, zipped=False):
        if zipped:
            _buf = self._safe_tools_cmd_zipped_output(cmd)
        else:
            _buf = self._safe_tools_cmd(cmd)
        return self._as_json(_buf)

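    # Locates the ceph tools pod by partial name match on "rook-ceph-tools"
    # in the "rook-ceph" namespace; when several pods match, the first one
    # is used after a warning.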
    def _get_tools_pod_name(self):
        # get ceph pod
        _pods = self.master.kube.get_pods_by_partial_name(
            self.ceph_app_label,
            self.ceph_ns
        )
        # _names = self.master.kube.get_pod_names_by_partial_name(
        #     self.ceph_app_label,
        #     self.ceph_ns
        # )
        if not _pods:
            raise KubeException(
                "Failed to find pod using '{}'".format(self.ceph_app_label)
            )
        elif len(_pods) > 1:
            logger_cli.warning(
                "WARNING: Environment has more than one pod "
                "with '{}' app: {}".format(
                    self.ceph_app_label,
                    ", ".join([p.metadata.name for p in _pods])
                )
            )
        else:
            logger_cli.debug("... found '{}'".format(_pods[0].metadata.name))
        self.ceph_pod = _pods[0]
        return _pods[0].metadata.name

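    # Registers a collected item; filename overrides the "<key>.<ext>"
    # default used by generate_archive(), e.g. (illustrative):
    #   self._add_ceph_info_item("ceph_df", "Ceph DF", data, filename="df.json")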
    def _add_ceph_info_item(self, key, title, data, filename=None):
        # handle data
        if key in self.ceph_info:
            self.ceph_info[key]["title"] = title
            self.ceph_info[key]["data"] = data
        else:
            self.ceph_info[key] = {
                "title": title,
                "data": data
            }
        if filename:
            self.ceph_info[key]["filename"] = filename

    def _parse_dev_classes(self, deviceClasses):
        _devClasses = []
        for _i in deviceClasses:
            _devClasses += list(_i.values())
        return set(_devClasses)

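    # Reads the rook "cephclusters" custom resource (ceph.rook.io/v1),
    # caches image, version, health and mon count in self.cluster_info and
    # returns the detected Ceph version string.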
    def get_ceph_cluster_config(self):
        # get cephclusters resource
        logger_cli.info("# Loading '{}' object of type '{}/{}'".format(
            self.ceph_plural,
            self.ceph_group,
            self.ceph_apiversion
        ))
        _r = self.master.kube.get_custom_resource(
            self.ceph_group,
            self.ceph_apiversion,
            self.ceph_plural,
        )
        # find cluster
        _cluster = None
        if len(_r['items']) < 1:
            logger_cli.warning(
                "WARNING: Failed to find '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
            return 'unknown'
        elif len(_r['items']) > 1:
            logger_cli.warning(
                "WARNING: Multiple clusters found '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
        _cluster = _r['items'][0]
        _s = _cluster['status']
        self.cluster_info.update({
            'image': _s['version']['image'],
            'version': _s['version']['version'],
            'device_classes': self._parse_dev_classes(
                _s['storage'].get('deviceClasses', [])
            ),
            'phase': _s['phase'],
            'state': _s['state'],
            'health': _s['ceph'].get('health', {}),
            'previousHealth': _s['ceph'].get('previousHealth', {}),
            'lastChanged': _s['ceph'].get('lastChanged', ""),
            'lastChecked': _s['ceph'].get('lastChecked', ""),
            'mon_count': _cluster['spec']['mon']['count']
        })
        self.nodes = _cluster['spec']['storage']['nodes']
        logger_cli.info("-> Found Ceph cluster: {} ({})".format(
            self.cluster_info['version'],
            self.cluster_info['image']
        ))
        return self.cluster_info['version']

    def get_cluster_status(self):
        return self._safe_get_cmd_output_as_json("ceph -s -f json")

    def get_health_detail(self):
        return self._safe_get_cmd_output_as_json("ceph -f json health detail")

    def get_ceph_df(self):
        return self._safe_get_cmd_output_as_json("ceph df -f json")

    def get_ceph_pg_dump(self):
        return self._safe_get_cmd_output_as_json(
            "ceph pg dump -f json",
            zipped=True
        )

    def get_ceph_osd_df(self):
        return self._safe_get_cmd_output_as_json("ceph osd df -f json")

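    # Collects the full snapshot: CRUSH map (json and text), cluster status,
    # health details, mon/osd/pg dumps, df and rados df, the ceph report,
    # anonymized auth data, per-device health metrics and a timed series of
    # "ceph osd perf" latency samples.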
    def gather_info(self):
        logger_cli.info("# Gathering Ceph cluster info")
        # Collect info
        _c = self._safe_tools_cmd
        _cj = self._safe_get_cmd_output_as_json
        # Crush Map
        logger_cli.info("-> Collecting CRUSH map")
        _cmap_tmp_path = "/tmp/crushmap.bin"
        _r = _c(
            "ceph osd getcrushmap -o " + _cmap_tmp_path,
            expect_output=False
        )
        # TODO: Handle errors in _r
        logger_cli.debug("... 'getcrushmap' return value is: '{}'".format(_r))

        # Get Crush map as json and text
        self._add_ceph_info_item(
            "crushmap_json",
            "Crush Map (json)",
            _cj("crushtool -i " + _cmap_tmp_path + " --dump"),
            filename="crushmap.json"
        )
        # _crushmap = _cj("crushtool -i " + _cmap_tmp_path + " --dump")
        self._add_ceph_info_item(
            "crushmap_text",
            "Crush Map (text)",
            _c("crushtool -d " + _cmap_tmp_path),
            filename="crushmap.txt"
        )

        logger_cli.info("-> Collecting ceph osd crush dump")
        self._add_ceph_info_item(
            "osd_crushdump",
            "Crush dump (osd)",
            _cj("ceph osd crush dump")
        )

        logger_cli.info("-> Collecting cluster status")
        self._add_ceph_info_item(
            "cluster_status",
            "Cluster status",
            self.get_cluster_status()
        )

        logger_cli.info("-> Collecting health detail")
        self._add_ceph_info_item(
            "health_detail",
            "Health details",
            self.get_health_detail()
        )

        logger_cli.info("-> Collecting monmap")
        self._add_ceph_info_item(
            "monmap",
            "Ceph Mon map",
            _cj("ceph mon dump -f json")
        )

        logger_cli.info("-> Collecting ceph df")
        self._add_ceph_info_item(
            "ceph_df",
            "Ceph DF",
            self.get_ceph_df()
        )

        logger_cli.info("-> Collecting ceph osd df")
        self._add_ceph_info_item(
            "ceph_osd_df",
            "Ceph OSD DF",
            self.get_ceph_osd_df()
        )

        logger_cli.info("-> Collecting ceph osd dump")
        self._add_ceph_info_item(
            "ceph_osd_dump",
            "Ceph OSD dump",
            _cj("ceph osd dump -f json")
        )

        logger_cli.info("-> Collecting rados df")
        self._add_ceph_info_item(
            "rados_df",
            "Rados DF",
            _cj("rados df -f json")
        )

        logger_cli.info("-> Collecting ceph report")
        self._add_ceph_info_item(
            "ceph_report",
            "Ceph Report",
            _cj("ceph report")
        )

        logger_cli.info("-> Collecting auth data anonymized")
        _auth_data = _cj("ceph auth list -f json")
        # Anonymize data
        # _cj("ceph auth list -f json | sed 's/AQ[^=]*==/KEY/g'")
        for item in _auth_data["auth_dump"]:
            if "key" in item:
                item['key'] = "key-data-redacted"
        self._add_ceph_info_item(
            "ceph_auth_ls",
            "Ceph Auth Data (anonymized)",
            _auth_data
        )

        logger_cli.info("-> Collecting ceph pg dump")
        self._add_ceph_info_item(
            "ceph_pg_dump",
            "Ceph PG dump",
            self.get_ceph_pg_dump()
        )

        logger_cli.info("-> Collecting ceph running configuration")
        self._add_ceph_info_item(
            "ceph_config_dump",
            "Ceph Configuration Dump",
            _cj("ceph config dump -f json")
        )

        logger_cli.info("-> Collecting health metrics")
        _health_metrics = {}
        _devices = _c("ceph device ls")
        _devices = _devices.splitlines()
        cmd_list = []
        for device in _devices:
            _t = device.split()
            _dev = _t[0]
            _node = _t[1] if len(_t) > 1 else "unknown"
            _osd = _t[2] if len(_t) > 2 else "unknown"

            if _dev == "DEVICE":
                continue
            # _metric = _cj("ceph device get-health-metrics {}".format(_dev))
            _cmd = "ceph device get-health-metrics {}".format(_dev)
            cmd_list.append(_cmd)
            _dev_name = "{}_{}".format(_osd, _dev)
            _health_metrics[_dev_name] = {}
            _health_metrics[_dev_name]['node_name'] = _node
            _health_metrics[_dev_name]['osd_name'] = _osd
            _health_metrics[_dev_name]['cmd'] = _cmd

        results = self.master.exec_cmds_on_pod(
            self.ceph_pod,
            cmd_list
        )

        logger_cli.info("-> Processing results")
        for _r in results:
            _cmd = _r[3]
            _j = self._as_json(_r[2])
            for _dev_name in _health_metrics.keys():
                if "cmd" in _health_metrics[_dev_name] and \
                        _health_metrics[_dev_name]["cmd"] == _cmd:
                    _health_metrics[_dev_name].update(_j)
                    _health_metrics[_dev_name].pop("cmd")
                    break

        self._add_ceph_info_item(
            "ceph_health",
            "Ceph Health Metrics",
            _health_metrics
        )

        # Latency values
        # config const for set
        _latency_count = 10
        _latency_delay = 4
        logger_cli.info(
            "-> Collecting ceph osd latency data "
            "({} total, {} sec delay)".format(
                _latency_count,
                _latency_delay
            )
        )
        _osd_lat = {
            "total": _latency_count,
            "delay": _latency_delay,
            "data": []
        }
        _progress = Progress(_latency_count)
        _index = 1
        while _index <= _latency_count:
            _progress.write_progress(_index)
            _osd_lat["data"].append(_cj("ceph osd perf -f json"))
            sleep(_latency_delay)
            _index += 1
        _progress.end()
        self._add_ceph_info_item(
            "osd_latency_data",
            "OSD Latency metrics",
            _osd_lat
        )

        return

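    # Runs "ceph config show-with-defaults" for every OSD and splits the
    # values into a common baseline plus per-OSD overrides, so the report
    # only highlights settings that differ between OSDs.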
    def gather_osd_configs(self):
        _total_osd = len(self.ceph_info["ceph_osd_df"]["data"]["nodes"])
        logger_cli.info(
            "-> Gathering OSD configuration ({})".format(_total_osd)
        )
        cmds = {}
        cmd_list = []
        for _osd in self.ceph_info["ceph_osd_df"]["data"]["nodes"]:
            _cmd = "ceph config show-with-defaults -f json {}".format(
                _osd["name"]
            )
            cmd_list.append(_cmd)
            cmds[_osd["name"]] = _cmd

        results = self.master.exec_cmds_on_pod(
            self.ceph_pod,
            cmd_list
        )

        logger_cli.info("-> Processing results")
        _cfgs = {}
        for _r in results:
            _cmd = _r[3]
            _j = self._as_json(_r[2])
            for _osd_name in cmds.keys():
                if cmds[_osd_name] == _cmd:
                    _cfgs[_osd_name] = _j
                    break

        # Process configs
        _base = {}
        _uniq = {}
        logger_cli.info("-> Filtering config values")
        _progress = Progress(_total_osd)
        _idx = 1
        for _osd, _data in _cfgs.items():
            _progress.write_progress(_idx, note=_osd)
            for _o in _data:
                _name = _o.pop("name")
                if not _o["value"]:
                    _o["value"] = "-"
                if _name not in _base:
                    _base[_name] = _o
                elif _base[_name]["value"] != _o["value"]:
                    _progress.clearline()
                    logger_cli.info(
                        "...specific value for {} (src: '{}'): {}={}".format(
                            _osd,
                            _o["source"],
                            _name,
                            _o["value"]
                        )
                    )
                    _uniq[_osd] = {
                        _name: _o
                    }
            _idx += 1
        _progress.end()
        self._add_ceph_info_item(
            "osd_config_data",
            "OSD Configuration values",
            {
                "common": _base,
                "uniq": _uniq
            }
        )
        return