# Author: Alex Savatieiev (osavatieiev@mirantis.com; a.savex@gmail.com)
# Copyright 2019-2022 Mirantis, Inc.
import base64
import io
import json
import os
import tarfile
from time import sleep


from cfg_checker.common import logger_cli
from cfg_checker.common.exception import KubeException

from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile
from cfg_checker.nodes import KubeNodes
from cfg_checker.reports import reporter


class CephInfo(object):
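    """Base class for collecting, reporting and archiving Ceph info."""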
    def __init__(
        self,
        config
    ):
        self.env_config = config
        return

    def get_transposed_latency_table(self):
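        """Transpose collected `ceph osd perf` samples into a table.

        Builds one row per OSD and one column per sampling iteration;
        the "<dev>" row carries the header cell for each column.
        """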
        _table = {
            "<dev>": []
        }
        for _pfd in self.ceph_info['osd_latency_data']['data']['data']:
            _table["<dev>"].append({
                "formatted": " cL/aL ",
                "commit_latency_ms": "Commit, ms",
                "apply_latency_ms": "Apply, ms",
                "commit_latency_ns": "Commit, ns",
                "apply_latency_ns": "Apply, ns"
            })
            for _f in _pfd['osdstats']['osd_perf_infos']:
                _n = "osd_{}".format(_f['id'])
                if _n not in _table:
                    _table[_n] = []
                _table[_n].append({
                    "formatted": "{:>3}/{:<3}".format(
                        _f['perf_stats']['commit_latency_ms'],
                        _f['perf_stats']['apply_latency_ms'],
                    ),
                    "commit_latency_ms": _f['perf_stats']['commit_latency_ms'],
                    "apply_latency_ms": _f['perf_stats']['apply_latency_ms'],
                    "commit_latency_ns": _f['perf_stats']['commit_latency_ns'],
                    "apply_latency_ns": _f['perf_stats']['apply_latency_ns']
                })
        self.ceph_info['osd_latency_data']['table'] = _table
        return _table

    def get_latest_health_readout(self):
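        """Pick the most recent timestamped record for each device.

        Empty records are kept as {} so the summary table stays complete.
        """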
        _h = self.ceph_info['ceph_health']['data']
        self.ceph_info['ceph_health']['latest'] = {}
        for _n, _d in _h.items():
            if not _d:
                self.ceph_info['ceph_health']['latest'][_n] = {}
                continue
            else:
                # TODO: Consider filtering out or prepare data for the table
                _osd = _d.pop("osd_name")
                _node_name = _d.pop("node_name")
                # Additional check for empty data
                if not _d:
                    self.ceph_info['ceph_health']['latest'][_n] = {}
                    continue
                _date = sorted(_d.keys(), reverse=True)[0]
                self.ceph_info['ceph_health']['date'] = _date
                self.ceph_info['ceph_health']['latest'][_n] = _d[_date]
                self.ceph_info['ceph_health']['latest'][_n]["osd_name"] = _osd
                self.ceph_info['ceph_health']['latest'][_n]["node_name"] = \
                    _node_name

        return self.ceph_info['ceph_health']['latest']

    def print_summary(self):
        logger_cli.info("\n# Ceph Cluster summary")
        # Health status
        _h = self.ceph_info['health_detail']['data']
        logger_cli.info("Cluster status: {}".format(_h['status']))
        for _chk, _d in _h['checks'].items():
            logger_cli.info(
                "+ {}: {}\n\tSummary: {}".format(
                    _chk,
                    _d['severity'],
                    _d['summary']['message']
                )
            )
            logger_cli.info("\tDetails:")
            for _item in _d['detail']:
                logger_cli.info("\t '{}'".format(_item['message']))

        # OSD health metrics
        logger_cli.info("\n# Device health metrics:")
        _fmt = " {:45} {:^14} {:^9} {:^6} {:^6}"
        logger_cli.info(
            _fmt.format(
                "Device Name",
                "Info",
                "Speed",
                "SMART",
                "Tempr."
            )
        )
        _latest = self.get_latest_health_readout()
        for _n, _d in _latest.items():
            if not _d:
                logger_cli.info("{:45} {:<10}".format(_n, "<empty>"))
                continue

            _status = _d['smart_status']['passed']
            if "interface_speed" in _d:
                _speed = _d['interface_speed']['current']['string']
            else:
                _speed = "-"

            _status = 'passed' if _status else 'failed'
            logger_cli.info(
                _fmt.format(
                    _n,
                    _d['device']['info_name'],
                    _speed,
                    _status,
                    _d['temperature']['current']
                )
            )

        # Latency table
        logger_cli.info(
            "\n# OSD Latency data ({} iterations, {} sec delay), "
            "table items 'osd_dev: N:cL/aL'\n"
            " 'Commit Latency' -> 'cL', 'Apply Latency' -> 'aL'\n".format(
                self.ceph_info['osd_latency_data']['data']['total'],
                self.ceph_info['osd_latency_data']['data']['delay']
            )
        )
        _strs = self.get_transposed_latency_table()
        for _osd, _list in _strs.items():
            _row = [c["formatted"] for c in _list]
            logger_cli.info(
                " {:8}: {}".format(
                    _osd,
                    " ".join(_row)
                )
            )
        logger_cli.info("\n")

        # critical config values
        # TODO: print/calculate config values

        return

    def dump_info(self):
        with open('cephdump.json', 'wt') as _f:
            _f.write(json.dumps(self.ceph_info, indent=2))

    def load_info(self):
        with open('cephdump.json', 'rt') as _f:
            self.ceph_info = json.load(_f)

    def generate_archive(self, tgzfilename):
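        """Write every collected ceph_info section into the archive.

        dict/list data is serialized to '<key>.json'; any other data
        is written as plain text to '<key>.txt'.
        """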
        if not self.ceph_info:
            logger_cli.warning(
                "WARNING: Ceph Info Data not detected. "
                "Consider checking the log for errors."
            )
        else:
            # Create Archive
            logger_cli.info("-> Generating archive '{}'".format(tgzfilename))
            _tgz = TGZFile(
                tgzfilename,
                label="MCP Checker: Generated Ceph Information"
            )
            # Iterate every key and write data to tar file
            for key, d in self.ceph_info.items():
                _filename = None
                # Cast buf to a proper type
                _buf = None
                if isinstance(d["data"], (dict, list)):
                    _buf = json.dumps(d["data"], indent=2)
                    _filename = key + ".json"
                elif isinstance(d["data"], str):
                    _buf = d["data"]
                    _filename = key + ".txt"
                else:
                    _buf = str(d["data"])
                    _filename = key + ".txt"
                logger_cli.debug("... writing '{}'".format(_filename))
                _tgz.add_file(_filename, buf=_buf, replace=True)

        return

    def create_html_report(self, filename):
        """
        Create a static HTML page with the collected Ceph info report

        :return: none
        """
        logger_cli.info("### Generating report to '{}'".format(filename))
        _report = reporter.ReportToFile(
            reporter.HTMLCephInfo(self),
            filename
        )
        _report(
            {
                "info": self.ceph_info,
                "cluster": self.cluster_info,
                "nodes": self.nodes,
                "ceph_version": self.ceph_version,
            }
        )
        logger_cli.info("-> Done")

        return


class SaltCephInfo(CephInfo):
    def __init__(
        self,
        config
    ):
        logger_cli.warning(
            "\nWARNING: Not implemented for Salt environment!\n"
        )

        # self.master = SaltNodes(config)
        super(SaltCephInfo, self).__init__(config)
        return


class KubeCephInfo(CephInfo):
    ceph_ns = "rook-ceph"
    ceph_app_label = "rook-ceph-tools"
    ceph_group = "ceph.rook.io"
    ceph_apiversion = "v1"
    ceph_plural = "cephclusters"
    ceph_version = "unknown"

    def __init__(self, config):
        self.master = KubeNodes(config)
        super(KubeCephInfo, self).__init__(config)
        # Init ceph tools pod
        self.pod_name = self._get_tools_pod_name()
        self.ceph_info = {}
        self.cluster_info = {}
        self.ceph_version = self.get_ceph_cluster_config()

    def _safe_tools_cmd(self, cmd_str, expect_output=True):
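        """Execute a command in the ceph tools pod and return its output."""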
        _r = self.master.exec_cmd_on_target_pod(
            self.pod_name,
            self.ceph_ns,
            cmd_str
        )
        if expect_output and not _r:
            logger_cli.debug("... got empty output for '{}'".format(cmd_str))
        elif not expect_output and _r:
            logger_cli.warning(
                "WARNING: Unexpected output for '{}':\n"
                "===== Start\n{}\n===== End".format(cmd_str, _r)
            )
        return _r

    def _safe_tools_cmd_zipped_output(self, cmd_str):
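        """Run a command and fetch its potentially large output compressed.

        The command writes its result to a file inside the pod ('-o');
        the file is tar/gzip-ed and base64-encoded for safe text
        transport over pod exec, then decoded and extracted locally.
        """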
        # temp file
        _tmp_path = "/tmp"
        _filename = "checker_cmd_output"
        _tar_path = os.path.join(_tmp_path, "checker_cmd.tgz")
        _path = os.path.join(_tmp_path, _filename)

        # Run original cmd with redirect
        _cmd = [cmd_str, "-o", _path]
        self._safe_tools_cmd(" ".join(_cmd), expect_output=False)
        # zip it and base64 encode
        _cmd = ["tar", "-zcvf", _tar_path, _path]
        self._safe_tools_cmd(" ".join(_cmd))
        _b64 = self._safe_tools_cmd("base64 " + _tar_path)
        # decode and decompress
        _io = io.BytesIO(base64.standard_b64decode(_b64))
        _json = ""
        with tarfile.open(fileobj=_io) as _tar:
            _tar_item = _tar.extractfile(_tar.getmembers()[0])
            _json = _tar_item.read()
        # cleanup
        self._safe_tools_cmd("rm -f " + _path)
        self._safe_tools_cmd("rm -f " + _tar_path)
        return _json

    def _safe_get_cmd_output_as_json(self, cmd, zipped=False):
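        """Run a command and parse its output as JSON.

        On parse failure, logs a truncated sample of the buffer and
        returns the raw output instead.
        """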
        if zipped:
            _buf = self._safe_tools_cmd_zipped_output(cmd)
        else:
            _buf = self._safe_tools_cmd(cmd)
        try:
            return json.loads(_buf)
        except ValueError as e:
            _out = ""
            if len(_buf) > 512:
                _out = _buf[:512]
                _out += "..."
            else:
                _out = _buf
            logger_cli.error(
                "\nERROR: failed to parse json: '{}'. Data: '{}'".format(
                    e,
                    _out
                )
            )
            return _buf

    def _get_tools_pod_name(self):
        # get ceph pod
        _names = self.master.kube.get_pod_names_by_partial_name(
            self.ceph_app_label,
            self.ceph_ns
        )
        if not _names:
            raise KubeException(
                "Failed to find pod using '{}'".format(self.ceph_app_label)
            )
        elif len(_names) > 1:
            logger_cli.warning(
                "WARNING: Environment has more than one pod "
                "with '{}' app: {}".format(
                    self.ceph_app_label,
                    ", ".join(_names)
                )
            )
        else:
            logger_cli.debug("... found '{}'".format(_names[0]))
        return _names[0]

    def _add_ceph_info_item(self, key, title, data):
        if key in self.ceph_info:
            self.ceph_info[key]["title"] = title
            self.ceph_info[key]["data"] = data
        else:
            self.ceph_info[key] = {
                "title": title,
                "data": data
            }

    def _parse_dev_classes(self, deviceClasses):
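        """Flatten rook's deviceClasses list of dicts into a set of names."""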
        _devClasses = []
        for _i in deviceClasses:
            _devClasses += list(_i.values())
        return set(_devClasses)

    def get_ceph_cluster_config(self):
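        """Fetch the rook 'cephclusters' resource and cache its status.

        Populates self.cluster_info and self.nodes; returns the Ceph
        version string, or 'unknown' when no cluster object is found.
        """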
        # get cephclusters resource
        logger_cli.info("# Loading '{}' object of type '{}/{}'".format(
            self.ceph_plural,
            self.ceph_group,
            self.ceph_apiversion
        ))
        _r = self.master.kube.get_custom_resource(
            self.ceph_group,
            self.ceph_apiversion,
            self.ceph_plural,
        )
        # find cluster
        _cluster = None
        if len(_r['items']) < 1:
            logger_cli.warning(
                "WARNING: Failed to find '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
            return 'unknown'
        elif len(_r['items']) > 1:
            logger_cli.warning(
                "WARNING: Multiple clusters found '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
        _cluster = _r['items'][0]
        _s = _cluster['status']
        self.cluster_info.update({
            'image': _s['version']['image'],
            'version': _s['version']['version'],
            'device_classes': self._parse_dev_classes(
                _s['storage']['deviceClasses']
            ),
            'phase': _s['phase'],
            'state': _s['state'],
            'health': _s['ceph']['health'],
            'previousHealth': _s['ceph']['previousHealth'],
            'lastChanged': _s['ceph']['lastChanged'],
            'lastChecked': _s['ceph']['lastChecked'],
            'mon_count': _cluster['spec']['mon']['count']
        })
        self.nodes = _cluster['spec']['storage']['nodes']
        logger_cli.info("-> Found Ceph cluster: {} ({})".format(
            self.cluster_info['version'],
            self.cluster_info['image']
        ))
        return self.cluster_info['version']

    def get_cluster_status(self):
        return self._safe_get_cmd_output_as_json("ceph -s -f json")

    def get_health_detail(self):
        return self._safe_get_cmd_output_as_json("ceph -f json health detail")

    def get_ceph_df(self):
        return self._safe_get_cmd_output_as_json("ceph df -f json")

    def get_ceph_pg_dump(self):
        return self._safe_get_cmd_output_as_json(
            "ceph pg dump -f json",
            zipped=True
        )

    def get_ceph_osd_df(self):
        return self._safe_get_cmd_output_as_json("ceph osd df -f json")

    def gather_info(self):
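        """Collect the full Ceph data set into self.ceph_info.

        Gathers CRUSH map and dumps, cluster status, health details,
        mon map, df/osd df/rados df, OSD dump, ceph report, anonymized
        auth data, pg dump, running configuration, per-device health
        metrics and repeated OSD latency samples.
        """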
        logger_cli.info("# Gathering Ceph cluster info")
        # Collect info
        _c = self._safe_tools_cmd
        _cj = self._safe_get_cmd_output_as_json
        # Crush Map
        logger_cli.info("-> Collecting CRUSH map")
        _cmap_tmp_path = "/tmp/crushmap.bin"
        _r = _c(
            "ceph osd getcrushmap -o " + _cmap_tmp_path,
            expect_output=False
        )
        # TODO: Handle errors in _r
        logger_cli.debug("... 'getcrushmap' return value is: '{}'".format(_r))

        # Get Crush map as json and text
        self._add_ceph_info_item(
            "crushmap_json",
            "Crush Map (json)",
            _cj("crushtool -i " + _cmap_tmp_path + " --dump")
        )
        # _crushmap = _cj("crushtool -i " + _cmap_tmp_path + " --dump")
        self._add_ceph_info_item(
            "crushmap_text",
            "Crush Map (text)",
            _c("crushtool -d " + _cmap_tmp_path)
        )

        logger_cli.info("-> Collecting ceph osd crush dump")
        self._add_ceph_info_item(
            "osd_crushdump",
            "Crush dump (osd)",
            _cj("ceph osd crush dump")
        )

        logger_cli.info("-> Collecting cluster status")
        self._add_ceph_info_item(
            "cluster_status",
            "Cluster status",
            self.get_cluster_status()
        )

        logger_cli.info("-> Collecting health detail")
        self._add_ceph_info_item(
            "health_detail",
            "Health details",
            self.get_health_detail()
        )

        logger_cli.info("-> Collecting monmap")
        self._add_ceph_info_item(
            "monmap",
            "Ceph Mon map",
            _cj("ceph mon dump -f json")
        )

        logger_cli.info("-> Collecting ceph df")
        self._add_ceph_info_item(
            "ceph_df",
            "Ceph DF",
            self.get_ceph_df()
        )

        logger_cli.info("-> Collecting ceph osd df")
        self._add_ceph_info_item(
            "ceph_osd_df",
            "Ceph OSD DF",
            self.get_ceph_osd_df()
        )

        logger_cli.info("-> Collecting ceph osd dump")
        self._add_ceph_info_item(
            "ceph_osd_dump",
            "Ceph OSD dump",
            _cj("ceph osd dump -f json")
        )

        logger_cli.info("-> Collecting rados df")
        self._add_ceph_info_item(
            "rados_df",
            "Rados DF",
            _cj("rados df -f json")
        )

        logger_cli.info("-> Collecting ceph report")
        self._add_ceph_info_item(
            "ceph_report",
            "Ceph Report",
            _cj("ceph report")
        )

        logger_cli.info("-> Collecting anonymized auth data")
        _auth_data = _cj("ceph auth list -f json")
        # Anonymize data
        # _cj("ceph auth list -f json | sed 's/AQ[^=]*==/KEY/g'")
        for item in _auth_data["auth_dump"]:
            if "key" in item:
                item['key'] = "key-data-redacted"
        self._add_ceph_info_item(
            "ceph_auth_ls",
            "Ceph Auth Data (anonymized)",
            _auth_data
        )

        logger_cli.info("-> Collecting ceph pg dump")
        self._add_ceph_info_item(
            "ceph_pg_dump",
            "Ceph PG dump",
            self.get_ceph_pg_dump()
        )

        logger_cli.info("-> Collecting ceph running configuration")
        self._add_ceph_info_item(
            "ceph_config_dump",
            "Ceph Configuration Dump",
            _cj("ceph config dump -f json")
        )

        logger_cli.info("-> Collecting health metrics")
        _health_metrics = {}
        _devices = _c("ceph device ls")
        _devices = _devices.splitlines()
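        # Plain `ceph device ls` output is expected to be a header row
        # (skipped below via the "DEVICE" marker) plus one line per
        # device: DEVICE, HOST:DEV, DAEMONS, LIFE EXPECTANCY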
        _progress = Progress(len(_devices)-1)
        _index = 1
        for device in _devices:
            _t = device.split()
            _osd = _t[2]
            _node = _t[1]
            _dev = _t[0]
            if _dev == "DEVICE":
                continue
            _metric = _cj("ceph device get-health-metrics {}".format(_dev))
            _dev_name = "{}_{}".format(_osd, _dev)
            _health_metrics[_dev_name] = _metric
            _health_metrics[_dev_name]['node_name'] = _node
            _health_metrics[_dev_name]['osd_name'] = _osd
            _progress.write_progress(_index, note=_dev_name)
            _index += 1
        _progress.end()
        self._add_ceph_info_item(
            "ceph_health",
            "Ceph Health Metrics",
            _health_metrics
        )

        # Latency values
        # sampling parameters
        _latency_count = 10
        _latency_delay = 4
        logger_cli.info(
            "-> Collecting ceph osd latency data "
            "({} total, {} sec delay)".format(
                _latency_count,
                _latency_delay
            )
        )
        _osd_lat = {
            "total": _latency_count,
            "delay": _latency_delay,
            "data": []
        }
        _progress = Progress(_latency_count)
        _index = 1
        while _index <= _latency_count:
            _progress.write_progress(_index)
            _osd_lat["data"].append(_cj("ceph osd perf -f json"))
            sleep(_latency_delay)
            _index += 1
        _progress.end()
        self._add_ceph_info_item(
            "osd_latency_data",
            "OSD Latency metrics",
            _osd_lat
        )

        return

    def gather_osd_configs(self):
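        """Collect `ceph config show-with-defaults` for every OSD.

        Values shared by all OSDs are stored under 'common'; per-OSD
        deviations are collected under 'uniq'.
        """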
        _total_osd = len(self.ceph_info["ceph_osd_df"]["data"]["nodes"])
        logger_cli.info(
            "-> Gathering OSD configuration ({})".format(_total_osd)
        )
        # Shortcuts
        # _c = self._safe_tools_cmd
        _cj = self._safe_get_cmd_output_as_json
        _progress = Progress(_total_osd)
        _idx = 1
        _cfgs = {}
        for _osd in self.ceph_info["ceph_osd_df"]["data"]["nodes"]:
            _progress.write_progress(_idx, note=_osd["name"])
            _cfgs[_osd["name"]] = _cj(
                "ceph config show-with-defaults -f json {}".format(
                    _osd["name"]
                )
            )
            _idx += 1
        _progress.end()

        # Process configs
        _base = {}
        _uniq = {}
        logger_cli.info("-> Filtering config values")
        _progress = Progress(_total_osd)
        _idx = 1
        for _osd, _data in _cfgs.items():
            _progress.write_progress(_idx, note=_osd)
            for _o in _data:
                _name = _o.pop("name")
                if not _o["value"]:
                    _o["value"] = "-"
                if _name not in _base:
                    _base[_name] = _o
                elif _base[_name]["value"] != _o["value"]:
                    _progress.clearline()
                    logger_cli.info(
                        "...specific value for {} (src: '{}'): {}={}".format(
                            _osd,
                            _o["source"],
                            _name,
                            _o["value"]
                        )
                    )
                    # accumulate all differing values for this OSD
                    _uniq.setdefault(_osd, {})[_name] = _o
            _idx += 1
        _progress.end()
        self._add_ceph_info_item(
            "osd_config_data",
            "OSD Configuration values",
            {
                "common": _base,
                "uniq": _uniq
            }
        )
        return