# Author: Alex Savatieiev (osavatieiev@mirantis.com; a.savex@gmail.com)
# Copyright 2019-2022 Mirantis, Inc.
import base64
import io
import json
import os
import tarfile
from datetime import datetime
from time import sleep

from cfg_checker.common import logger_cli, logger
from cfg_checker.common.exception import KubeException

from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile
from cfg_checker.nodes import KubeNodes
from cfg_checker.reports import reporter


class CephInfo(object):
    def __init__(
        self,
        config
    ):
        self.env_config = config
        return

    def get_info_archive_filename(self, client, project):
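        """Build the archive filename as dot-joined tags:
        'CephCollectData.<client>.<project>.<YYYY-MM-DD>.tar.gz'
        """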
        # prefill known data
        _tags = ["CephCollectData"]
        _tags.append(client)
        _tags.append(project)

        # generate date for tgz
        _file_datetime_fmt = "%Y-%m-%d"
        _dt = datetime.now().strftime(_file_datetime_fmt)
        _tags.append(_dt)

        # extension
        _tags.append("tar")
        _tags.append("gz")
        return ".".join(_tags)

    def get_transposed_latency_table(self):
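        """Transpose the collected 'ceph osd perf' iterations into a table
        keyed by OSD name; the '<dev>' key holds the header column and each
        row keeps a formatted 'cL/aL' string plus the raw latency values.
        """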
        _table = {
            "<dev>": []
        }
        for _pfd in self.ceph_info['osd_latency_data']['data']['data']:
            _table["<dev>"].append({
                "formatted": " cL/aL ",
                "commit_latency_ms": "Commit, ms",
                "apply_latency_ms": "Apply, ms",
                "commit_latency_ns": "Commit, ns",
                "apply_latency_ns": "Apply, ns"
            })
            for _f in _pfd['osdstats']['osd_perf_infos']:
                _n = "osd_{}".format(_f['id'])
                if _n not in _table:
                    _table[_n] = []
                _table[_n].append({
                    "formatted": "{:>3}/{:<3}".format(
                        _f['perf_stats']['commit_latency_ms'],
                        _f['perf_stats']['apply_latency_ms'],
                    ),
                    "commit_latency_ms": _f['perf_stats']['commit_latency_ms'],
                    "apply_latency_ms": _f['perf_stats']['apply_latency_ms'],
                    "commit_latency_ns": _f['perf_stats']['commit_latency_ns'],
                    "apply_latency_ns": _f['perf_stats']['apply_latency_ns']
                })
        self.ceph_info['osd_latency_data']['table'] = _table
        return _table

    def get_latest_health_readout(self):
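        """Pick the most recent timestamped readout for every device and
        re-attach the popped 'osd_name'/'node_name' fields to it.
        """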
        _h = self.ceph_info['ceph_health']['data']
        self.ceph_info['ceph_health']['latest'] = {}
        for _n, _d in _h.items():
            if not _d:
                self.ceph_info['ceph_health']['latest'][_n] = {}
                continue
            # TODO: Consider filtering out or prepare data for the table
            _osd = _d.pop("osd_name", "unknown")
            _node_name = _d.pop("node_name", "unknown")
            # Additional check for empty data
            if not _d:
                self.ceph_info['ceph_health']['latest'][_n] = {}
                continue
            _date = sorted(_d.keys(), reverse=True)[0]
            self.ceph_info['ceph_health']['date'] = _date
            self.ceph_info['ceph_health']['latest'][_n] = _d[_date]
            self.ceph_info['ceph_health']['latest'][_n]["osd_name"] = _osd
            self.ceph_info['ceph_health']['latest'][_n]["node_name"] = \
                _node_name

        return self.ceph_info['ceph_health']['latest']

    def print_summary(self):
        logger_cli.info("\n# Ceph Cluster summary")
        # Health status
        _h = self.ceph_info['health_detail']['data']
        logger_cli.info("Cluster status: {}".format(_h['status']))
        for _chk, _d in _h['checks'].items():
            logger_cli.info(
                "+ {}: {}\n\tSummary: {}".format(
                    _chk,
                    _d['severity'],
                    _d['summary']['message']
                )
            )
            logger_cli.info("\tDetails:")
            for _item in _d['detail']:
                logger_cli.info("\t '{}'".format(_item['message']))

        # OSD health metrics
        logger_cli.info("\n# Device health metrics:")
        _fmt = " {:45} {:^14} {:^9} {:^6} {:^6}"
        logger_cli.info(
            _fmt.format(
                "Device Name",
                "Info",
                "Speed",
                "SMART",
                "Tempr."
            )
        )
        _latest = self.get_latest_health_readout()
        for _n, _d in _latest.items():
            if not _d:
                logger_cli.info("{:45} {:<10}".format(_n, "<empty>"))
                continue

            _status = _d['smart_status']['passed']
            if "interface_speed" in _d:
                _speed = _d['interface_speed']['current']['string']
            else:
                _speed = "-"

            _status = 'passed' if _status else 'failed'
            logger_cli.info(
                _fmt.format(
                    _n,
                    _d['device']['info_name'],
                    _speed,
                    _status,
                    _d['temperature']['current']
                )
            )

        # Latency table
        logger_cli.info(
            "\n# OSD Latency data ({} iterations, {} sec delay), "
            "table items 'osd_dev: N:cL/aL'\n"
            " 'Commit Latency' -> 'cL', 'Apply Latency' -> 'aL'\n".format(
                self.ceph_info['osd_latency_data']['data']['total'],
                self.ceph_info['osd_latency_data']['data']['delay']
            )
        )
        _strs = self.get_transposed_latency_table()
        for _osd, _list in _strs.items():
            _row = [c["formatted"] for c in _list]
            logger_cli.info(
                " {:8}: {}".format(
                    _osd,
                    " ".join(_row)
                )
            )
        logger_cli.info("\n")

        # critical config values
        # TODO: print/calculate config values

        return

    def dump_info(self):
        with open('cephdump.json', 'wt') as _f:
            _f.write(json.dumps(self.ceph_info, indent=2))

    def load_info(self):
        with open('cephdump.json', 'rt') as _f:
            self.ceph_info = json.load(_f)

    def generate_archive(self, tgzfilename):
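        """Write every collected ceph_info item into a tgz archive;
        dict/list data is serialized to '<key>.json', strings go to
        '<key>.txt', unless the item carries an explicit 'filename'.
        """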
        def _ensure_fname(ext):
            # use the item's own filename when set, else derive it from key
            return key + ext if _fname is None else _fname

        if not self.ceph_info:
            logger_cli.warning(
                "WARNING: Ceph Info Data not detected. "
                "Consider checking the log for errors."
            )
        else:
            # Create Archive
            logger_cli.info("-> Generating archive '{}'".format(tgzfilename))
            _tgz = TGZFile(
                tgzfilename,
                label="MCP Checker: Generated Ceph Information"
            )
            # Iterate every key and write data to tar file
            for key, d in self.ceph_info.items():
                _fname = None
                # Cast buf to a proper type
                _buf = None
                if "filename" in d:
                    _fname = d["filename"]
                if isinstance(d["data"], dict) or isinstance(d["data"], list):
                    _buf = json.dumps(d["data"], indent=2)
                    _filename = _ensure_fname(".json")
                elif isinstance(d["data"], str):
                    _buf = d["data"]
                    _filename = _ensure_fname(".txt")
                else:
                    _buf = str(d["data"])
                    _filename = _ensure_fname(".txt")
                logger_cli.debug("... writing '{}'".format(_filename))
                _tgz.add_file(_filename, buf=_buf, replace=True)

        return

    def create_html_report(self, filename):
        """
        Create static html showing ceph info report

        :return: none
        """
        logger_cli.info("### Generating report to '{}'".format(filename))
        _report = reporter.ReportToFile(
            reporter.HTMLCephInfo(self),
            filename
        )
        _report(
            {
                "info": self.ceph_info,
                "cluster": self.cluster_info,
                "nodes": self.nodes,
                "ceph_version": self.ceph_version,
            }
        )
        logger_cli.info("-> Done")

        return


class SaltCephInfo(CephInfo):
    def __init__(
        self,
        config
    ):
        logger_cli.warning(
            "\nWARNING: Not implemented for Salt environment!\n"
        )

        # self.master = SaltNodes(config)
        super(SaltCephInfo, self).__init__(config)
        return


class KubeCephInfo(CephInfo):
    ceph_ns = "rook-ceph"
    ceph_app_label = "rook-ceph-tools"
    ceph_group = "ceph.rook.io"
    ceph_apiversion = "v1"
    ceph_plural = "cephclusters"
    ceph_version = "unknown"

    def __init__(self, config):
        self.master = KubeNodes(config)
        super(KubeCephInfo, self).__init__(config)
        # Init ceph tools pod
        self.pod_name = self._get_tools_pod_name()
        self.ceph_info = {}
        self.cluster_info = {}
        self.ceph_version = self.get_ceph_cluster_config()

    def _safe_tools_cmd(self, cmd_str, expect_output=True):
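        """Execute a shell command inside the rook-ceph tools pod and
        log when the output does not match the expectation.
        """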
        _r = self.master.exec_cmd_on_target_pod(
            self.pod_name,
            self.ceph_ns,
            cmd_str
        )
        if expect_output and not _r:
            logger.debug("... got empty output for '{}'".format(cmd_str))
        elif not expect_output and _r:
            logger.warning(
                "WARNING: Unexpected output for '{}':\n"
                "===== Start\n{}\n===== End".format(cmd_str, _r)
            )
        return _r

    def _safe_tools_cmd_zipped_output(self, cmd_str):
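        """Run a command that writes its (potentially large) output to a
        file, then tar+gzip and base64-encode it inside the pod so it can
        be transferred through the exec channel and unpacked locally.
        """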
        # temp file
        _tmp_path = "/tmp"
        _filename = "checker_cmd_output"
        _tar_path = os.path.join(_tmp_path, "checker_cmd.tgz")
        _path = os.path.join(_tmp_path, _filename)

        # Run original cmd with redirect
        _cmd = [cmd_str, "-o", _path]
        self._safe_tools_cmd(" ".join(_cmd), expect_output=False)
        # zip it and base64 encode
        _cmd = ["tar", "-zcvf", _tar_path, _path]
        self._safe_tools_cmd(" ".join(_cmd))
        _b64 = self._safe_tools_cmd("base64 " + _tar_path)
        # decode and decompress
        _io = io.BytesIO(base64.standard_b64decode(_b64))
        _json = ""
        with tarfile.open(fileobj=_io) as _tar:
            _tar_item = _tar.extractfile(_tar.getmembers()[0])
            _json = _tar_item.read()
        # cleanup
        self._safe_tools_cmd("rm -f " + _path)
        self._safe_tools_cmd("rm -f " + _tar_path)
        return _json

    @staticmethod
    def _as_json(buf):
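        """Parse buf as JSON; on failure, log up to 512 chars of the
        offending data and return the raw buffer instead.
        """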
        try:
            return json.loads(buf)
        except ValueError as e:
            _out = ""
            if len(buf) > 512:
                _out = buf[:512]
                _out += "..."
            else:
                _out = buf
            logger_cli.error(
                "\nERROR: failed to parse json: '{}'. Data: '{}'".format(
                    e,
                    _out
                )
            )
            return buf

    def _safe_get_cmd_output_as_json(self, cmd, zipped=False):
        if zipped:
            _buf = self._safe_tools_cmd_zipped_output(cmd)
        else:
            _buf = self._safe_tools_cmd(cmd)
        return self._as_json(_buf)

    def _get_tools_pod_name(self):
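        """Locate the rook-ceph tools pod by its partial app name and
        remember it for later batched command execution.
        """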
        # get ceph pod
        _pods = self.master.kube.get_pods_by_partial_name(
            self.ceph_app_label,
            self.ceph_ns
        )
        if not _pods:
            raise KubeException(
                "Failed to find pod using '{}'".format(self.ceph_app_label)
            )
        elif len(_pods) > 1:
            logger_cli.warning(
                "WARNING: Environment has more than one pod "
                "with '{}' app: {}".format(
                    self.ceph_app_label,
                    ", ".join([p.metadata.name for p in _pods])
                )
            )
        else:
            logger_cli.debug("... found '{}'".format(_pods[0].metadata.name))
        self.ceph_pod = _pods[0]
        return _pods[0].metadata.name

    def _add_ceph_info_item(self, key, title, data, filename=None):
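        """Store (or update) a collected item under key with its human
        readable title, optionally pinning the in-archive filename.
        """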
        # update an existing item or create a new one
        if key in self.ceph_info:
            self.ceph_info[key]["title"] = title
            self.ceph_info[key]["data"] = data
        else:
            self.ceph_info[key] = {
                "title": title,
                "data": data
            }
        if filename:
            self.ceph_info[key]["filename"] = filename

    def _parse_dev_classes(self, deviceClasses):
        _devClasses = []
        for _i in deviceClasses:
            _devClasses += list(_i.values())
        return set(_devClasses)

    def get_ceph_cluster_config(self):
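        """Fetch the Rook 'cephclusters' custom resource, cache cluster
        metadata in self.cluster_info and return the Ceph version string.
        """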
        # get cephclusters resource
        logger_cli.info("# Loading '{}' object of type '{}/{}'".format(
            self.ceph_plural,
            self.ceph_group,
            self.ceph_apiversion
        ))
        _r = self.master.kube.get_custom_resource(
            self.ceph_group,
            self.ceph_apiversion,
            self.ceph_plural,
        )
        # find cluster
        _cluster = None
        if len(_r['items']) < 1:
            logger_cli.warning(
                "WARNING: Failed to find '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
            return 'unknown'
        elif len(_r['items']) > 1:
            logger_cli.warning(
                "WARNING: Multiple clusters found '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
        _cluster = _r['items'][0]
        _s = _cluster['status']
        self.cluster_info.update({
            'image': _s['version']['image'],
            'version': _s['version']['version'],
            'device_classes': self._parse_dev_classes(
                _s['storage'].get('deviceClasses', [])
            ),
            'phase': _s['phase'],
            'state': _s['state'],
            'health': _s['ceph'].get('health', {}),
            'previousHealth': _s['ceph'].get('previousHealth', {}),
            'lastChanged': _s['ceph'].get('lastChanged', ""),
            'lastChecked': _s['ceph'].get('lastChecked', ""),
            'mon_count': _cluster['spec']['mon']['count']
        })
        self.nodes = _cluster['spec']['storage']['nodes']
        logger_cli.info("-> Found Ceph cluster: {} ({})".format(
            self.cluster_info['version'],
            self.cluster_info['image']
        ))
        return self.cluster_info['version']

    def get_cluster_status(self):
        return self._safe_get_cmd_output_as_json("ceph -s -f json")

    def get_health_detail(self):
        return self._safe_get_cmd_output_as_json("ceph -f json health detail")

    def get_ceph_df(self):
        return self._safe_get_cmd_output_as_json("ceph df -f json")

    def get_ceph_pg_dump(self):
        return self._safe_get_cmd_output_as_json(
            "ceph pg dump -f json",
            zipped=True
        )

    def get_ceph_osd_df(self):
        return self._safe_get_cmd_output_as_json("ceph osd df -f json")

    def gather_info(self):
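        """Collect the full set of Ceph diagnostics (maps, dumps, health
        metrics and latency samples) into self.ceph_info.
        """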
        logger_cli.info("# Gathering Ceph cluster info")
        # Collect info
        _c = self._safe_tools_cmd
        _cj = self._safe_get_cmd_output_as_json
        # Crush Map
        logger_cli.info("-> Collecting CRUSH map")
        _cmap_tmp_path = "/tmp/crushmap.bin"
        _r = _c(
            "ceph osd getcrushmap -o " + _cmap_tmp_path,
            expect_output=False
        )
        # TODO: Handle errors in _r
        logger_cli.debug("... 'getcrushmap' return value is: '{}'".format(_r))

        # Get Crush map as json and text
        self._add_ceph_info_item(
            "crushmap_json",
            "Crush Map (json)",
            _cj("crushtool -i " + _cmap_tmp_path + " --dump"),
            filename="crushmap.json"
        )
        self._add_ceph_info_item(
            "crushmap_text",
            "Crush Map (text)",
            _c("crushtool -d " + _cmap_tmp_path),
            filename="crushmap.txt"
        )

        logger_cli.info("-> Collecting ceph osd crush dump")
        self._add_ceph_info_item(
            "osd_crushdump",
            "Crush dump (osd)",
            _cj("ceph osd crush dump")
        )

        logger_cli.info("-> Collecting cluster status")
        self._add_ceph_info_item(
            "cluster_status",
            "Cluster status",
            self.get_cluster_status()
        )

        logger_cli.info("-> Collecting health detail")
        self._add_ceph_info_item(
            "health_detail",
            "Health details",
            self.get_health_detail()
        )

        logger_cli.info("-> Collecting monmap")
        self._add_ceph_info_item(
            "monmap",
            "Ceph Mon map",
            _cj("ceph mon dump -f json")
        )

        logger_cli.info("-> Collecting ceph df")
        self._add_ceph_info_item(
            "ceph_df",
            "Ceph DF",
            self.get_ceph_df()
        )

        logger_cli.info("-> Collecting ceph osd df")
        self._add_ceph_info_item(
            "ceph_osd_df",
            "Ceph OSD DF",
            self.get_ceph_osd_df()
        )

        logger_cli.info("-> Collecting ceph osd dump")
        self._add_ceph_info_item(
            "ceph_osd_dump",
            "Ceph OSD dump",
            _cj("ceph osd dump -f json")
        )

        logger_cli.info("-> Collecting rados df")
        self._add_ceph_info_item(
            "rados_df",
            "Rados DF",
            _cj("rados df -f json")
        )

        logger_cli.info("-> Collecting ceph report")
        self._add_ceph_info_item(
            "ceph_report",
            "Ceph Report",
            _cj("ceph report")
        )

        logger_cli.info("-> Collecting anonymized auth data")
        _auth_data = _cj("ceph auth list -f json")
        # Redact key material before storing
        for item in _auth_data["auth_dump"]:
            if "key" in item:
                item['key'] = "key-data-redacted"
        self._add_ceph_info_item(
            "ceph_auth_ls",
            "Ceph Auth Data (anonymized)",
            _auth_data
        )

        logger_cli.info("-> Collecting ceph pg dump")
        self._add_ceph_info_item(
            "ceph_pg_dump",
            "Ceph PG dump",
            self.get_ceph_pg_dump()
        )

        logger_cli.info("-> Collecting ceph running configuration")
        self._add_ceph_info_item(
            "ceph_config_dump",
            "Ceph Configuration Dump",
            _cj("ceph config dump -f json")
        )

        logger_cli.info("-> Collecting health metrics")
        _health_metrics = {}
        _devices = _c("ceph device ls")
        _devices = _devices.splitlines()
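        # 'ceph device ls' output is assumed to be whitespace-separated
        # columns (DEVICE, HOST:DEV, DAEMONS, ...); the header row is
        # skipped below and missing columns fall back to "unknown"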
        cmd_list = []
        for device in _devices:
            _t = device.split()
            _dev = _t[0]
            _node = _t[1] if len(_t) > 1 else "unknown"
            _osd = _t[2] if len(_t) > 2 else "unknown"

            if _dev == "DEVICE":
                continue
            _cmd = "ceph device get-health-metrics {}".format(_dev)
            cmd_list.append(_cmd)
            _dev_name = "{}_{}".format(_osd, _dev)
            _health_metrics[_dev_name] = {}
            _health_metrics[_dev_name]['node_name'] = _node
            _health_metrics[_dev_name]['osd_name'] = _osd
            _health_metrics[_dev_name]['cmd'] = _cmd

        results = self.master.exec_cmds_on_pod(
            self.ceph_pod,
            cmd_list
        )

        logger_cli.info("-> Processing results")
        for _r in results:
            _cmd = _r[3]
            _j = self._as_json(_r[2])
            for _dev_name in _health_metrics.keys():
                if "cmd" in _health_metrics[_dev_name] and \
                        _health_metrics[_dev_name]["cmd"] == _cmd:
                    _health_metrics[_dev_name].update(_j)
                    _health_metrics[_dev_name].pop("cmd")
                    break

        self._add_ceph_info_item(
            "ceph_health",
            "Ceph Health Metrics",
            _health_metrics
        )

        # Latency values
        # number of iterations and delay between them
        _latency_count = 10
        _latency_delay = 4
        logger_cli.info(
            "-> Collecting ceph osd latency data "
            "({} total, {} sec delay)".format(
                _latency_count,
                _latency_delay
            )
        )
        _osd_lat = {
            "total": _latency_count,
            "delay": _latency_delay,
            "data": []
        }
        _progress = Progress(_latency_count)
        _index = 1
        while _index <= _latency_count:
            _progress.write_progress(_index)
            _osd_lat["data"].append(_cj("ceph osd perf -f json"))
            sleep(_latency_delay)
            _index += 1
        _progress.end()
        self._add_ceph_info_item(
            "osd_latency_data",
            "OSD Latency metrics",
            _osd_lat
        )

        return

    def gather_osd_configs(self):
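        """Fetch 'ceph config show-with-defaults' for every OSD in one
        batch, then split the values into a common baseline and per-OSD
        overrides.
        """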
        _total_osd = len(self.ceph_info["ceph_osd_df"]["data"]["nodes"])
        logger_cli.info(
            "-> Gathering OSD configuration ({})".format(_total_osd)
        )
        cmds = {}
        cmd_list = []
        for _osd in self.ceph_info["ceph_osd_df"]["data"]["nodes"]:
            _cmd = "ceph config show-with-defaults -f json {}".format(
                _osd["name"]
            )
            cmd_list.append(_cmd)
            cmds[_osd["name"]] = _cmd

        results = self.master.exec_cmds_on_pod(
            self.ceph_pod,
            cmd_list
        )

        logger_cli.info("-> Processing results")
        _cfgs = {}
        for _r in results:
            _cmd = _r[3]
            _j = self._as_json(_r[2])
            for _osd_name in cmds.keys():
                if cmds[_osd_name] == _cmd:
                    _cfgs[_osd_name] = _j
                    break

        # Process configs
        _base = {}
        _uniq = {}
        logger_cli.info("-> Filtering config values")
        _progress = Progress(_total_osd)
        _idx = 1
        for _osd, _data in _cfgs.items():
            _progress.write_progress(_idx, note=_osd)
            for _o in _data:
                _name = _o.pop("name")
                if not _o["value"]:
                    _o["value"] = "-"
                if _name not in _base:
                    _base[_name] = _o
                elif _base[_name]["value"] != _o["value"]:
                    _progress.clearline()
                    logger_cli.info(
                        "...specific value for {} (src: '{}'): {}={}".format(
                            _osd,
                            _o["source"],
                            _name,
                            _o["value"]
                        )
                    )
                    # collect per-OSD overrides without overwriting
                    # the ones already found for this OSD
                    _uniq.setdefault(_osd, {})[_name] = _o
            _idx += 1
        _progress.end()
        self._add_ceph_info_item(
            "osd_config_data",
            "OSD Configuration values",
            {
                "common": _base,
                "uniq": _uniq
            }
        )
        return
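
# Typical usage (a sketch; assumes a valid cfg_checker config object):
#
#     ci = KubeCephInfo(config)
#     ci.gather_info()
#     ci.gather_osd_configs()
#     ci.dump_info()
#     ci.generate_archive(ci.get_info_archive_filename("client", "project"))
#     ci.create_html_report("ceph-report.html")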