import json
from time import sleep


from cfg_checker.common import logger_cli
from cfg_checker.common.exception import KubeException

from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile
from cfg_checker.nodes import KubeNodes
from cfg_checker.reports import reporter


class CephInfo(object):
    def __init__(
        self,
        config
    ):
        self.env_config = config
        return

    def get_transposed_latency_table(self):
        _table = {
            "<dev>": []
        }
        for _pfd in self.ceph_info['osd_latency_data']['data']['data']:
            _table["<dev>"].append({
                "formatted": " cL/aL ",
                "commit_latency_ms": "Commit, ms",
                "apply_latency_ms": "Apply, ms",
                "commit_latency_ns": "Commit, ns",
                "apply_latency_ns": "Apply, ns"
            })
            for _f in _pfd['osdstats']['osd_perf_infos']:
                _n = "osd_{}".format(_f['id'])
                if _n not in _table:
                    _table[_n] = []
                _table[_n].append({
                    "formatted": "{:>3}/{:<3}".format(
                        _f['perf_stats']['commit_latency_ms'],
                        _f['perf_stats']['apply_latency_ms'],
                    ),
                    "commit_latency_ms": _f['perf_stats']['commit_latency_ms'],
                    "apply_latency_ms": _f['perf_stats']['apply_latency_ms'],
                    "commit_latency_ns": _f['perf_stats']['commit_latency_ns'],
                    "apply_latency_ns": _f['perf_stats']['apply_latency_ns']
                })
        self.ceph_info['osd_latency_data']['table'] = _table
        return _table
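
    # Illustrative only: with two collected "ceph osd perf" samples and two
    # OSDs, the transposed table built above would look roughly like this
    # (latency values are made up):
    #
    #   {
    #       "<dev>": [{"formatted": " cL/aL ", ...}, {"formatted": " cL/aL ", ...}],
    #       "osd_0": [{"formatted": " 12/3  ", ...}, {"formatted": " 10/2  ", ...}],
    #       "osd_1": [{"formatted": "  7/1  ", ...}, {"formatted": "  9/4  ", ...}]
    #   }
    #
    # i.e. one column per collected iteration and one row per OSD, with the
    # "<dev>" row holding the header cell for each column.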

    def get_latest_health_readout(self):
        _h = self.ceph_info['ceph_health']['data']
        self.ceph_info['ceph_health']['latest'] = {}
        for _n, _d in _h.items():
            if not _d:
                self.ceph_info['ceph_health']['latest'][_n] = {}
                continue
            # TODO: Consider filtering out or prepare data for the table
            _osd = _d.pop("osd_name")
            _node_name = _d.pop("node_name")
            _date = sorted(_d.keys(), reverse=True)[0]
            self.ceph_info['ceph_health']['date'] = _date
            self.ceph_info['ceph_health']['latest'][_n] = _d[_date]
            self.ceph_info['ceph_health']['latest'][_n]["osd_name"] = _osd
            self.ceph_info['ceph_health']['latest'][_n]["node_name"] = \
                _node_name

        return self.ceph_info['ceph_health']['latest']
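
    # A sketch of the resulting structure (field names taken from the code
    # above; the key, date, and metric payload are illustrative):
    #
    #   self.ceph_info['ceph_health']['latest'] = {
    #       "osd.0_<device-id>": {
    #           ...health metrics for the most recent date...,
    #           "osd_name": "osd.0",
    #           "node_name": "cmp001"
    #       }
    #   }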

    def print_summary(self):
        logger_cli.info("\n# Ceph Cluster summary")
        # Health status
        _h = self.ceph_info['health_detail']['data']
        logger_cli.info("Cluster status: {}".format(_h['status']))
        for _chk, _d in _h['checks'].items():
            logger_cli.info(
                "+ {}: {}\n\tSummary: {}".format(
                    _chk,
                    _d['severity'],
                    _d['summary']['message']
                )
            )
            logger_cli.info("\tDetails:")
            for _item in _d['detail']:
                logger_cli.info("\t '{}'".format(_item['message']))

        # OSD health metrics
        logger_cli.info("\n# Device health metrics:")
        _fmt = " {:45} {:^14} {:^9} {:^6} {:^6}"
        logger_cli.info(
            _fmt.format(
                "Device Name",
                "Info",
                "Speed",
                "SMART",
                "Tempr."
            )
        )
        _latest = self.get_latest_health_readout()
        for _n, _d in _latest.items():
            if not _d:
                logger_cli.info("{:45} {:<10}".format(_n, "<empty>"))
                continue

            _status = _d['smart_status']['passed']
            if "interface_speed" in _d:
                _speed = _d['interface_speed']['current']['string']
            else:
                _speed = "-"

            _status = 'passed' if _status else 'failed'
            logger_cli.info(
                _fmt.format(
                    _n,
                    _d['device']['info_name'],
                    _speed,
                    _status,
                    _d['temperature']['current']
                )
            )

        # Latency table
        logger_cli.info(
            "\n# OSD Latency data ({} iterations, {} sec delay), "
            "table items 'osd_dev: N:cL/aL'\n"
            " 'Commit Latency' -> 'cL', 'Apply Latency' -> 'aL'\n".format(
                self.ceph_info['osd_latency_data']['data']['total'],
                self.ceph_info['osd_latency_data']['data']['delay']
            )
        )
        _strs = self.get_transposed_latency_table()
        for _osd, _list in _strs.items():
            _row = [c["formatted"] for c in _list]
            logger_cli.info(
                " {:8}: {}".format(
                    _osd,
                    " ".join(_row)
                )
            )
        logger_cli.info("\n")

        # critical config values
        # TODO: print/calculate config values

        return

    def dump_info(self):
        with open('cephdump.json', 'wt') as _f:
            _f.write(json.dumps(self.ceph_info, indent=2))

    def load_info(self):
        with open('cephdump.json', 'rt') as _f:
            self.ceph_info = json.load(_f)
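
    # A minimal round-trip sketch (assumes ceph_info has already been
    # gathered; both helpers reuse the hardcoded 'cephdump.json' path above):
    #
    #   info = KubeCephInfo(config)
    #   info.gather_info()
    #   info.dump_info()   # writes cephdump.json
    #   info.load_info()   # restores self.ceph_info from cephdump.json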

    def generate_archive(self, tgzfilename):
        if not self.ceph_info:
            logger_cli.warning(
                "WARNING: Ceph Info Data not detected. "
                "Consider checking the log for errors."
            )
        else:
            # Create Archive
            logger_cli.info("-> Generating archive '{}'".format(tgzfilename))
            _tgz = TGZFile(
                tgzfilename,
                label="MCP Checker: Generated Ceph Information"
            )
            # Iterate every key and write data to tar file
            for key, d in self.ceph_info.items():
                _filename = None
                # Cast buf to a proper type
                _buf = None
                if isinstance(d["data"], (dict, list)):
                    _buf = json.dumps(d["data"], indent=2)
                    _filename = key + ".json"
                elif isinstance(d["data"], str):
                    _buf = d["data"]
                    _filename = key + ".txt"
                else:
                    _buf = str(d["data"])
                    _filename = key + ".txt"
                logger_cli.debug("... writing '{}'".format(_filename))
                _tgz.add_file(_filename, buf=_buf, replace=True)

        return

    def create_html_report(self, filename):
        """
        Create a static HTML page showing the Ceph info report

        :return: none
        """
        logger_cli.info("### Generating report to '{}'".format(filename))
        _report = reporter.ReportToFile(
            reporter.HTMLCephInfo(self),
            filename
        )
        _report(
            {
                "info": self.ceph_info,
                "cluster": self.cluster_info,
                "nodes": self.nodes,
                "ceph_version": self.ceph_version,
            }
        )
        logger_cli.info("-> Done")

        return


class SaltCephInfo(CephInfo):
    def __init__(
        self,
        config
    ):
        logger_cli.warning("\nWARNING: Not implemented for Salt environment!\n")

        # self.master = SaltNodes(config)
        super(SaltCephInfo, self).__init__(config)
        return


class KubeCephInfo(CephInfo):
    ceph_ns = "rook-ceph"
    ceph_app_label = "rook-ceph-tools"
    ceph_group = "ceph.rook.io"
    ceph_apiversion = "v1"
    ceph_plural = "cephclusters"
    ceph_version = "unknown"

    def __init__(self, config):
        self.master = KubeNodes(config)
        super(KubeCephInfo, self).__init__(config)
        # Init ceph tools pod
        self.pod_name = self._get_tools_pod_name()
        self.ceph_info = {}
        self.cluster_info = {}
        self.ceph_version = self.get_ceph_cluster_config()

    def _safe_tools_cmd(self, cmd, expect_output=True):
        _r = self.master.exec_cmd_on_target_pod(
            self.pod_name,
            self.ceph_ns,
            cmd
        )
        if expect_output and not _r:
            logger_cli.debug("... got empty output for '{}'".format(cmd))
        elif not expect_output and _r:
            logger_cli.warning(
                "WARNING: Unexpected output for '{}':\n"
                "===== Start\n{}\n===== End".format(cmd, _r)
            )
        return _r

    def _safe_get_cmd_output_as_json(self, cmd):
        _buf = self._safe_tools_cmd(cmd)
        try:
            return json.loads(_buf)
        except ValueError:
            logger_cli.error(
                "\nERROR: failed to parse json: '{}'".format(_buf)
            )
            return _buf
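
    # NOTE: on a JSON parse failure the method above returns the raw string
    # buffer instead of a dict/list, so callers indexing into the result
    # should be ready for either type. A defensive caller might look like
    # this (sketch):
    #
    #   _data = self._safe_get_cmd_output_as_json("ceph -s -f json")
    #   if isinstance(_data, str):
    #       logger_cli.warning("WARNING: got raw output instead of json")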

    def _get_tools_pod_name(self):
        # get ceph pod
        _names = self.master.kube.get_pod_names_by_partial_name(
            self.ceph_app_label,
            self.ceph_ns
        )
        if not _names:
            raise KubeException(
                "Failed to find pod using '{}'".format(self.ceph_app_label)
            )
        elif len(_names) > 1:
            logger_cli.warning(
                "WARNING: Environment has more than one pod "
                "with '{}' app: {}".format(
                    self.ceph_app_label,
                    ", ".join(_names)
                )
            )
        else:
            logger_cli.debug("... found '{}'".format(_names[0]))
        return _names[0]

    def _add_ceph_info_item(self, key, title, data):
        if key in self.ceph_info:
            self.ceph_info[key]["title"] = title
            self.ceph_info[key]["data"] = data
        else:
            self.ceph_info[key] = {
                "title": title,
                "data": data
            }

    def _parse_dev_classes(self, deviceClasses):
        _devClasses = []
        for _i in deviceClasses:
            _devClasses += list(_i.values())
        return set(_devClasses)
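
    # Example (illustrative): for a cephcluster status reporting
    # deviceClasses like [{"name": "hdd"}, {"name": "ssd"}, {"name": "hdd"}],
    # the method above returns the deduplicated set {"hdd", "ssd"}.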

    def get_ceph_cluster_config(self):
        # get cephclusters resource
        logger_cli.info("# Loading '{}' object of type '{}/{}'".format(
            self.ceph_plural,
            self.ceph_group,
            self.ceph_apiversion
        ))
        _r = self.master.kube.get_custom_resource(
            self.ceph_group,
            self.ceph_apiversion,
            self.ceph_plural,
        )
        # find cluster
        _cluster = None
        if len(_r['items']) < 1:
            logger_cli.warning(
                "WARNING: Failed to find '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
            return 'unknown'
        elif len(_r['items']) > 1:
            logger_cli.warning(
                "WARNING: Multiple clusters found '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
        _cluster = _r['items'][0]
        _s = _cluster['status']
        self.cluster_info.update({
            'image': _s['version']['image'],
            'version': _s['version']['version'],
            'device_classes': self._parse_dev_classes(
                _s['storage']['deviceClasses']
            ),
            'phase': _s['phase'],
            'state': _s['state'],
            'health': _s['ceph']['health'],
            'previousHealth': _s['ceph']['previousHealth'],
            'lastChanged': _s['ceph']['lastChanged'],
            'lastChecked': _s['ceph']['lastChecked'],
            'mon_count': _cluster['spec']['mon']['count']
        })
        self.nodes = _cluster['spec']['storage']['nodes']
        logger_cli.info("-> Found Ceph cluster: {} ({})".format(
            self.cluster_info['version'],
            self.cluster_info['image']
        ))
        return self.cluster_info['version']

    def gather_info(self):
        logger_cli.info("# Gathering Ceph cluster info")
        # Collect info
        _c = self._safe_tools_cmd
        _cj = self._safe_get_cmd_output_as_json
        # Crush Map
        logger_cli.info("-> Collecting CRUSH map")
        _cmap_tmp_path = "/tmp/crushmap.bin"
        _r = _c(
            "ceph osd getcrushmap -o " + _cmap_tmp_path,
            expect_output=False
        )
        # TODO: Handle errors in _r
        logger_cli.debug("... 'getcrushmap' return value is: '{}'".format(_r))

        # Get Crush map as json and text
        self._add_ceph_info_item(
            "crushmap_json",
            "Crush Map (json)",
            _cj("crushtool -i " + _cmap_tmp_path + " --dump")
        )
        self._add_ceph_info_item(
            "crushmap_text",
            "Crush Map (text)",
            _c("crushtool -d " + _cmap_tmp_path)
        )

        logger_cli.info("-> Collecting ceph osd crush dump")
        self._add_ceph_info_item(
            "osd_crushdump",
            "Crush dump (osd)",
            _cj("ceph osd crush dump")
        )

        logger_cli.info("-> Collecting cluster status")
        self._add_ceph_info_item(
            "cluster_status",
            "Cluster status",
            _cj("ceph -s -f json")
        )

        logger_cli.info("-> Collecting health detail")
        self._add_ceph_info_item(
            "health_detail",
            "Health details",
            _cj("ceph -f json health detail")
        )

        logger_cli.info("-> Collecting monmap")
        self._add_ceph_info_item(
            "monmap",
            "Ceph Mon map",
            _cj("ceph mon dump -f json")
        )

        logger_cli.info("-> Collecting ceph df")
        self._add_ceph_info_item(
            "ceph_df",
            "Ceph DF",
            _cj("ceph df -f json")
        )

        logger_cli.info("-> Collecting ceph osd df")
        self._add_ceph_info_item(
            "ceph_osd_df",
            "Ceph OSD DF",
            _cj("ceph osd df -f json")
        )

        logger_cli.info("-> Collecting ceph osd dump")
        self._add_ceph_info_item(
            "ceph_osd_dump",
            "Ceph OSD dump",
            _cj("ceph osd dump -f json")
        )

        logger_cli.info("-> Collecting rados df")
        self._add_ceph_info_item(
            "rados_df",
            "Rados DF",
            _cj("rados df -f json")
        )

        logger_cli.info("-> Collecting ceph report")
        self._add_ceph_info_item(
            "ceph_report",
            "Ceph Report",
            _cj("ceph report")
        )

        logger_cli.info("-> Collecting anonymized auth data")
        _auth_data = _cj("ceph auth list -f json")
        # Anonymize data
        # _cj("ceph auth list -f json | sed 's/AQ[^=]*==/KEY/g'")
        for item in _auth_data["auth_dump"]:
            if "key" in item:
                item['key'] = "key-data-redacted"
        self._add_ceph_info_item(
            "ceph_auth_ls",
            "Ceph Auth Data (anonymized)",
            _auth_data
        )

        logger_cli.info("-> Collecting ceph pg dump")
        self._add_ceph_info_item(
            "ceph_pg_dump",
            "Ceph PG dump",
            _cj("ceph pg dump -f json")
        )

        logger_cli.info("-> Collecting ceph running configuration")
        self._add_ceph_info_item(
            "ceph_config_dump",
            "Ceph Configuration Dump",
            _cj("ceph config dump -f json")
        )

        logger_cli.info("-> Collecting health metrics")
        _health_metrics = {}
        _devices = _c("ceph device ls")
        _devices = _devices.splitlines()
        _progress = Progress(len(_devices)-1)
        _index = 1
        for device in _devices:
            _t = device.split()
            # Skip the 'ceph device ls' table header and any malformed lines
            if len(_t) < 3 or _t[0] == "DEVICE":
                continue
            _dev = _t[0]
            _node = _t[1]
            _osd = _t[2]
            _metric = _cj("ceph device get-health-metrics {}".format(_dev))
            _dev_name = "{}_{}".format(_osd, _dev)
            _health_metrics[_dev_name] = _metric
            _health_metrics[_dev_name]['node_name'] = _node
            _health_metrics[_dev_name]['osd_name'] = _osd
            _progress.write_progress(_index, note=_dev_name)
            _index += 1
        _progress.end()
        self._add_ceph_info_item(
            "ceph_health",
            "Ceph Health Metrics",
            _health_metrics
        )

        # Latency values
        # collection constants (TODO: make configurable)
        _latency_count = 10
        _latency_delay = 4
        logger_cli.info(
            "-> Collecting ceph osd latency data "
            "({} total, {} sec delay)".format(
                _latency_count,
                _latency_delay
            )
        )
        _osd_lat = {
            "total": _latency_count,
            "delay": _latency_delay,
            "data": []
        }
        _progress = Progress(_latency_count)
        _index = 1
        while _index <= _latency_count:
            _progress.write_progress(_index)
            _osd_lat["data"].append(_cj("ceph osd perf -f json"))
            sleep(_latency_delay)
            _index += 1
        _progress.end()
        self._add_ceph_info_item(
            "osd_latency_data",
            "OSD Latency metrics",
            _osd_lat
        )

        return
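
# A minimal end-to-end usage sketch, assuming a prepared cfg-checker 'config'
# object (hypothetical driver code; the real entry points live elsewhere in
# cfg-checker):
#
#   ceph = KubeCephInfo(config)
#   ceph.gather_info()
#   ceph.create_html_report("ceph-report.html")
#   ceph.generate_archive("ceph-info.tgz")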