import base64
import io
import json
import os
import tarfile
from time import sleep


from cfg_checker.common import logger_cli
from cfg_checker.common.exception import KubeException

from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile
from cfg_checker.nodes import KubeNodes
from cfg_checker.reports import reporter


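# Typical usage (a sketch, assuming a loaded checker config object):
#
#   ceph = KubeCephInfo(config)            # resolves the rook-ceph-tools pod
#   ceph.gather_info()                     # collects maps, dumps and metrics
#   ceph.print_summary()
#   ceph.generate_archive("ceph_info.tgz")
#   ceph.create_html_report("ceph_info.html")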
class CephInfo(object):
    def __init__(
        self,
        config
    ):
        self.env_config = config
        return

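    # Build a printable latency table: one header column ("<dev>") plus a
    # column per OSD, appending a "commit/apply" formatted cell for each
    # collected perf iteration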
    def get_transposed_latency_table(self):
        _table = {
            "<dev>": []
        }
        for _pfd in self.ceph_info['osd_latency_data']['data']['data']:
            _table["<dev>"].append({
                "formatted": " cL/aL ",
                "commit_latency_ms": "Commit, ms",
                "apply_latency_ms": "Apply, ms",
                "commit_latency_ns": "Commit, ns",
                "apply_latency_ns": "Apply, ns"
            })
            for _f in _pfd['osdstats']['osd_perf_infos']:
                _n = "osd_{}".format(_f['id'])
                if _n not in _table:
                    _table[_n] = []
                _table[_n].append({
                    "formatted": "{:>3}/{:<3}".format(
                        _f['perf_stats']['commit_latency_ms'],
                        _f['perf_stats']['apply_latency_ms'],
                    ),
                    "commit_latency_ms": _f['perf_stats']['commit_latency_ms'],
                    "apply_latency_ms": _f['perf_stats']['apply_latency_ms'],
                    "commit_latency_ns": _f['perf_stats']['commit_latency_ns'],
                    "apply_latency_ns": _f['perf_stats']['apply_latency_ns']
                })
        self.ceph_info['osd_latency_data']['table'] = _table
        return _table

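    # Reduce per-device health history to the most recent reading:
    # for every device pick the newest timestamp key and keep its data
    # together with the owning OSD and node names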
    def get_latest_health_readout(self):
        _h = self.ceph_info['ceph_health']['data']
        self.ceph_info['ceph_health']['latest'] = {}
        for _n, _d in _h.items():
            if not _d:
                self.ceph_info['ceph_health']['latest'][_n] = {}
                continue
            else:
                # TODO: Consider filtering out or prepare data for the table
                _osd = _d.pop("osd_name")
                _node_name = _d.pop("node_name")
                # Additional check for empty data
                if not _d:
                    self.ceph_info['ceph_health']['latest'][_n] = {}
                    continue
                _date = sorted(_d.keys(), reverse=True)[0]
                self.ceph_info['ceph_health']['date'] = _date
                self.ceph_info['ceph_health']['latest'][_n] = _d[_date]
                self.ceph_info['ceph_health']['latest'][_n]["osd_name"] = _osd
                self.ceph_info['ceph_health']['latest'][_n]["node_name"] = \
                    _node_name

        return self.ceph_info['ceph_health']['latest']

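    # Print a condensed console report: cluster health checks, per-device
    # health metrics and the transposed OSD latency table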
    def print_summary(self):
        logger_cli.info("\n# Ceph Cluster summary")
        # Health status
        _h = self.ceph_info['health_detail']['data']
        logger_cli.info("Cluster status: {}".format(_h['status']))
        for _chk, _d in _h['checks'].items():
            logger_cli.info(
                "+ {}: {}\n\tSummary: {}".format(
                    _chk,
                    _d['severity'],
                    _d['summary']['message']
                )
            )
            logger_cli.info("\tDetails:")
            for _item in _d['detail']:
                logger_cli.info("\t '{}'".format(_item['message']))

        # OSD health metrics
        logger_cli.info("\n# Device health metrics:")
        _fmt = " {:45} {:^14} {:^9} {:^6} {:^6}"
        logger_cli.info(
            _fmt.format(
                "Device Name",
                "Info",
                "Speed",
                "SMART",
                "Tempr."
            )
        )
        _latest = self.get_latest_health_readout()
        for _n, _d in _latest.items():
            if not _d:
                logger_cli.info("{:45} {:<10}".format(_n, "<empty>"))
                continue

            _status = _d['smart_status']['passed']
            if "interface_speed" in _d:
                _speed = _d['interface_speed']['current']['string']
            else:
                _speed = "-"

            _status = 'passed' if _status else 'failed'
            logger_cli.info(
                _fmt.format(
                    _n,
                    _d['device']['info_name'],
                    _speed,
                    _status,
                    _d['temperature']['current']
                )
            )

        # Latency table
        logger_cli.info(
            "\n# OSD Latency data ({} iterations, {} sec delay), "
            "table items 'osd_dev: N:cL/aL'\n"
            " 'Commit Latency' -> 'cL', 'Apply Latency' -> 'aL'\n".format(
                self.ceph_info['osd_latency_data']['data']['total'],
                self.ceph_info['osd_latency_data']['data']['delay']
            )
        )
        _strs = self.get_transposed_latency_table()
        for _osd, _list in _strs.items():
            _row = [c["formatted"] for c in _list]
            logger_cli.info(
                " {:8}: {}".format(
                    _osd,
                    " ".join(_row)
                )
            )
        logger_cli.info("\n")

        # critical config values
        # TODO: print/calculate config values

        return

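    # Persist/restore the collected data as 'cephdump.json' so reports
    # can be rebuilt offline without re-querying the cluster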
    def dump_info(self):
        with open('cephdump.json', 'wt') as _f:
            _f.write(json.dumps(self.ceph_info, indent=2))

    def load_info(self):
        with open('cephdump.json', 'rt') as _f:
            self.ceph_info = json.load(_f)

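    # Serialize every collected section into a tgz archive: dict/list
    # values are written as '<key>.json', plain strings as '<key>.txt'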
    def generate_archive(self, tgzfilename):
        if not self.ceph_info:
            logger_cli.warning(
                "WARNING: Ceph Info Data not detected. "
                "Consider checking the log for errors."
            )
        else:
            # Create Archive
            logger_cli.info("-> Generating archive '{}'".format(tgzfilename))
            _tgz = TGZFile(
                tgzfilename,
                label="MCP Checker: Generated Ceph Information"
            )
            # Iterate every key and write data to tar file
            for key, d in self.ceph_info.items():
                _filename = None
                # Cast buf to a proper type
                _buf = None
                if isinstance(d["data"], (dict, list)):
                    _buf = json.dumps(d["data"], indent=2)
                    _filename = key + ".json"
                elif isinstance(d["data"], str):
                    _buf = d["data"]
                    _filename = key + ".txt"
                else:
                    _buf = str(d["data"])
                    _filename = key + ".txt"
                logger_cli.debug("... writing '{}'".format(_filename))
                _tgz.add_file(_filename, buf=_buf, replace=True)

        return

    def create_html_report(self, filename):
        """
        Create static html showing ceph info report

        :return: none
        """
        logger_cli.info("### Generating report to '{}'".format(filename))
        _report = reporter.ReportToFile(
            reporter.HTMLCephInfo(self),
            filename
        )
        _report(
            {
                "info": self.ceph_info,
                "cluster": self.cluster_info,
                "nodes": self.nodes,
                "ceph_version": self.ceph_version,
            }
        )
        logger_cli.info("-> Done")

        return


class SaltCephInfo(CephInfo):
    def __init__(
        self,
        config
    ):
        logger_cli.warning(
            "\nWARNING: Not implemented for Salt environment!\n"
        )

        # self.master = SaltNodes(config)
        super(SaltCephInfo, self).__init__(config)
        return


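# Kubernetes flavor: talks to a Rook-deployed Ceph cluster through the
# 'rook-ceph-tools' toolbox pod and the 'cephclusters' custom resource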
class KubeCephInfo(CephInfo):
    ceph_ns = "rook-ceph"
    ceph_app_label = "rook-ceph-tools"
    ceph_group = "ceph.rook.io"
    ceph_apiversion = "v1"
    ceph_plural = "cephclusters"
    ceph_version = "unknown"

    def __init__(self, config):
        self.master = KubeNodes(config)
        super(KubeCephInfo, self).__init__(config)
        # Init ceph tools pod
        self.pod_name = self._get_tools_pod_name()
        self.ceph_info = {}
        self.cluster_info = {}
        self.ceph_version = self.get_ceph_cluster_config()

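    # Execute a command inside the ceph tools pod; log when the presence
    # of output does not match the 'expect_output' expectation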
    def _safe_tools_cmd(self, cmd_str, expect_output=True):
        _r = self.master.exec_cmd_on_target_pod(
            self.pod_name,
            self.ceph_ns,
            cmd_str
        )
        if expect_output and not _r:
            logger_cli.debug("... got empty output for '{}'".format(cmd_str))
        elif not expect_output and _r:
            logger_cli.warning(
                "WARNING: Unexpected output for '{}':\n"
                "===== Start\n{}\n===== End".format(cmd_str, _r)
            )
        return _r

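    # Transport large command output safely through the pod exec channel:
    # redirect it to a file inside the pod, tar+gzip it, base64-encode the
    # archive for the text channel, then decode and unpack it locally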
    def _safe_tools_cmd_zipped_output(self, cmd_str):
        # temp file
        _tmp_path = "/tmp"
        _filename = "checker_cmd_output"
        _tar_path = os.path.join(_tmp_path, "checker_cmd.tgz")
        _path = os.path.join(_tmp_path, _filename)

        # Run original cmd with redirect
        _cmd = [cmd_str, "-o", _path]
        self._safe_tools_cmd(" ".join(_cmd), expect_output=False)
        # zip it and base64 encode
        _cmd = ["tar", "-zcvf", _tar_path, _path]
        self._safe_tools_cmd(" ".join(_cmd))
        _b64 = self._safe_tools_cmd("base64 " + _tar_path)
        # decode and decompress
        _io = io.BytesIO(base64.standard_b64decode(_b64))
        _json = ""
        with tarfile.open(fileobj=_io) as _tar:
            _tar_item = _tar.extractfile(_tar.getmembers()[0])
            _json = _tar_item.read()
        # cleanup
        self._safe_tools_cmd("rm -f " + _path)
        self._safe_tools_cmd("rm -f " + _tar_path)
        return _json

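    # Run a command and parse its output as JSON; on parse failure log
    # up to 512 chars of the raw output and return it unparsed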
    def _safe_get_cmd_output_as_json(self, cmd, zipped=False):
        if zipped:
            _buf = self._safe_tools_cmd_zipped_output(cmd)
        else:
            _buf = self._safe_tools_cmd(cmd)
        try:
            return json.loads(_buf)
        except ValueError as e:
            _out = ""
            if len(_buf) > 512:
                _out = _buf[:512]
                _out += "..."
            else:
                _out = _buf
            logger_cli.error(
                "\nERROR: failed to parse json: '{}'. Data: '{}'".format(
                    e,
                    _out
                )
            )
            return _buf

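    # Locate the ceph tools pod by partial name match within the
    # rook-ceph namespace; warn if more than one candidate is found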
    def _get_tools_pod_name(self):
        # get ceph pod
        _names = self.master.kube.get_pod_names_by_partial_name(
            self.ceph_app_label,
            self.ceph_ns
        )
        if not _names:
            raise KubeException(
                "Failed to find pod using '{}'".format(self.ceph_app_label)
            )
        elif len(_names) > 1:
            logger_cli.warning(
                "WARNING: Environment has more than one pod "
                "with '{}' app: {}".format(
                    self.ceph_app_label,
                    ", ".join(_names)
                )
            )
        else:
            logger_cli.debug("... found '{}'".format(_names[0]))
        return _names[0]

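    # Store (or update) one titled data section under self.ceph_info;
    # these sections later become archive files and report blocks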
    def _add_ceph_info_item(self, key, title, data):
        if key in self.ceph_info:
            self.ceph_info[key]["title"] = title
            self.ceph_info[key]["data"] = data
        else:
            self.ceph_info[key] = {
                "title": title,
                "data": data
            }

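    # Flatten the per-node deviceClasses list into a unique set of
    # class names (e.g. 'hdd', 'ssd')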
    def _parse_dev_classes(self, deviceClasses):
        _devClasses = []
        for _i in deviceClasses:
            _devClasses += list(_i.values())
        return set(_devClasses)

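    # Fetch the Rook 'cephclusters' custom resource and extract cluster
    # metadata (version, image, health, mon count, storage nodes);
    # returns the detected Ceph version string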
    def get_ceph_cluster_config(self):
        # get cephclusters resource
        logger_cli.info("# Loading '{}' object of type '{}/{}'".format(
            self.ceph_plural,
            self.ceph_group,
            self.ceph_apiversion
        ))
        _r = self.master.kube.get_custom_resource(
            self.ceph_group,
            self.ceph_apiversion,
            self.ceph_plural,
        )
        # find cluster
        _cluster = None
        if len(_r['items']) < 1:
            logger_cli.warning(
                "WARNING: Failed to find '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
            return 'unknown'
        elif len(_r['items']) > 1:
            logger_cli.warning(
                "WARNING: Multiple clusters found '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
        _cluster = _r['items'][0]
        _s = _cluster['status']
        self.cluster_info.update({
            'image': _s['version']['image'],
            'version': _s['version']['version'],
            'device_classes': self._parse_dev_classes(
                _s['storage']['deviceClasses']
            ),
            'phase': _s['phase'],
            'state': _s['state'],
            'health': _s['ceph']['health'],
            'previousHealth': _s['ceph']['previousHealth'],
            'lastChanged': _s['ceph']['lastChanged'],
            'lastChecked': _s['ceph']['lastChecked'],
            'mon_count': _cluster['spec']['mon']['count']
        })
        self.nodes = _cluster['spec']['storage']['nodes']
        logger_cli.info("-> Found Ceph cluster: {} ({})".format(
            self.cluster_info['version'],
            self.cluster_info['image']
        ))
        return self.cluster_info['version']

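    # Thin wrappers around ceph CLI queries; 'pg dump' output can be
    # large, so it travels through the zipped transport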
    def get_cluster_status(self):
        return self._safe_get_cmd_output_as_json("ceph -s -f json")

    def get_health_detail(self):
        return self._safe_get_cmd_output_as_json("ceph -f json health detail")

    def get_ceph_df(self):
        return self._safe_get_cmd_output_as_json("ceph df -f json")

    def get_ceph_pg_dump(self):
        return self._safe_get_cmd_output_as_json(
            "ceph pg dump -f json",
            zipped=True
        )

    def get_ceph_osd_df(self):
        return self._safe_get_cmd_output_as_json("ceph osd df -f json")

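    # Main collection driver: pulls CRUSH map, cluster status, health,
    # usage, dumps, anonymized auth data, per-device health metrics and
    # repeated OSD latency samples into self.ceph_info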
    def gather_info(self):
        logger_cli.info("# Gathering Ceph cluster info")
        # Collect info
        _c = self._safe_tools_cmd
        _cj = self._safe_get_cmd_output_as_json
        # Crush Map
        logger_cli.info("-> Collecting CRUSH map")
        _cmap_tmp_path = "/tmp/crushmap.bin"
        _r = _c(
            "ceph osd getcrushmap -o " + _cmap_tmp_path,
            expect_output=False
        )
        # TODO: Handle errors in _r
        logger_cli.debug("... 'getcrushmap' return value is: '{}'".format(_r))

        # Get Crush map as json and text
        self._add_ceph_info_item(
            "crushmap_json",
            "Crush Map (json)",
            _cj("crushtool -i " + _cmap_tmp_path + " --dump")
        )
        # _crushmap = _cj("crushtool -i " + _cmap_tmp_path + " --dump")
        self._add_ceph_info_item(
            "crushmap_text",
            "Crush Map (text)",
            _c("crushtool -d " + _cmap_tmp_path)
        )

        logger_cli.info("-> Collecting ceph osd crush dump")
        self._add_ceph_info_item(
            "osd_crushdump",
            "Crush dump (osd)",
            _cj("ceph osd crush dump")
        )

        logger_cli.info("-> Collecting cluster status")
        self._add_ceph_info_item(
            "cluster_status",
            "Cluster status",
            self.get_cluster_status()
        )

        logger_cli.info("-> Collecting health detail")
        self._add_ceph_info_item(
            "health_detail",
            "Health details",
            self.get_health_detail()
        )

        logger_cli.info("-> Collecting monmap")
        self._add_ceph_info_item(
            "monmap",
            "Ceph Mon map",
            _cj("ceph mon dump -f json")
        )

        logger_cli.info("-> Collecting ceph df")
        self._add_ceph_info_item(
            "ceph_df",
            "Ceph DF",
            self.get_ceph_df()
        )

        logger_cli.info("-> Collecting ceph osd df")
        self._add_ceph_info_item(
            "ceph_osd_df",
            "Ceph OSD DF",
            self.get_ceph_osd_df()
        )

        logger_cli.info("-> Collecting ceph osd dump")
        self._add_ceph_info_item(
            "ceph_osd_dump",
            "Ceph OSD dump",
            _cj("ceph osd dump -f json")
        )

        logger_cli.info("-> Collecting rados df")
        self._add_ceph_info_item(
            "rados_df",
            "Rados DF",
            _cj("rados df -f json")
        )

        logger_cli.info("-> Collecting ceph report")
        self._add_ceph_info_item(
            "ceph_report",
            "Ceph Report",
            _cj("ceph report")
        )

        logger_cli.info("-> Collecting anonymized auth data")
        _auth_data = _cj("ceph auth list -f json")
        # Anonymize data
        # _cj("ceph auth list -f json | sed 's/AQ[^=]*==/KEY/g'")
        for item in _auth_data["auth_dump"]:
            if "key" in item:
                item['key'] = "key-data-redacted"
        self._add_ceph_info_item(
            "ceph_auth_ls",
            "Ceph Auth Data (anonymized)",
            _auth_data
        )

        logger_cli.info("-> Collecting ceph pg dump")
        self._add_ceph_info_item(
            "ceph_pg_dump",
            "Ceph PG dump",
            self.get_ceph_pg_dump()
        )

        logger_cli.info("-> Collecting ceph running configuration")
        self._add_ceph_info_item(
            "ceph_config_dump",
            "Ceph Configuration Dump",
            _cj("ceph config dump -f json")
        )

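        # Per-device health: parse the plain-text 'ceph device ls' output,
        # skip its 'DEVICE ...' header row and pull SMART metrics for every
        # device, keyed as '<osd>_<device>'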
        logger_cli.info("-> Collecting health metrics")
        _health_metrics = {}
        _devices = _c("ceph device ls")
        _devices = _devices.splitlines()
        _progress = Progress(len(_devices)-1)
        _index = 1
        for device in _devices:
            _t = device.split()
            _osd = _t[2]
            _node = _t[1]
            _dev = _t[0]
            if _dev == "DEVICE":
                continue
            _metric = _cj("ceph device get-health-metrics {}".format(_dev))
            _dev_name = "{}_{}".format(_osd, _dev)
            _health_metrics[_dev_name] = _metric
            _health_metrics[_dev_name]['node_name'] = _node
            _health_metrics[_dev_name]['osd_name'] = _osd
            _progress.write_progress(_index, note=_dev_name)
            _index += 1
        _progress.end()
        self._add_ceph_info_item(
            "ceph_health",
            "Ceph Health Metrics",
            _health_metrics
        )

        # Latency values
        # config const for set
        _latency_count = 10
        _latency_delay = 4
        logger_cli.info(
            "-> Collecting ceph osd latency data "
            "({} total, {} sec delay)".format(
                _latency_count,
                _latency_delay
            )
        )
        _osd_lat = {
            "total": _latency_count,
            "delay": _latency_delay,
            "data": []
        }
        _progress = Progress(_latency_count)
        _index = 1
        while _index <= _latency_count:
            _progress.write_progress(_index)
            _osd_lat["data"].append(_cj("ceph osd perf -f json"))
            sleep(_latency_delay)
            _index += 1
        _progress.end()
        self._add_ceph_info_item(
            "osd_latency_data",
            "OSD Latency metrics",
            _osd_lat
        )

        return