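"""Ceph cluster information collection and reporting for cfg-checker.

Collects cluster status, health and OSD latency data (on Kubernetes,
via the rook-ceph-tools pod) and renders it as a console summary,
a tgz archive or a static HTML report.
"""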
import json
from time import sleep


from cfg_checker.common import logger_cli
from cfg_checker.common.exception import KubeException

from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile
from cfg_checker.nodes import KubeNodes
from cfg_checker.reports import reporter


class CephInfo(object):
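    """Base class holding collected Ceph cluster information.

    Subclasses fill self.ceph_info with {"title": ..., "data": ...}
    items keyed by section name and can then print, archive or report it.
    """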
    def __init__(
        self,
        config
    ):
        self.env_config = config
        return

    def get_transposed_latency_table(self):
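        """
        Transpose collected OSD latency samples into a per-OSD table:
        one row per OSD device, one formatted 'cL/aL' cell per sample.

        :return: dict, also cached in ceph_info['osd_latency_data']['table']
        """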
        _table = {
            "<dev>": []
        }
        for _pfd in self.ceph_info['osd_latency_data']['data']['data']:
            _table["<dev>"].append({
                "formatted": " cL/aL ",
                "commit_latency_ms": "Commit, ms",
                "apply_latency_ms": "Apply, ms",
                "commit_latency_ns": "Commit, ns",
                "apply_latency_ns": "Apply, ns"
            })
            for _f in _pfd['osdstats']['osd_perf_infos']:
                _n = "osd_{}".format(_f['id'])
                if _n not in _table:
                    _table[_n] = []
                _table[_n].append({
                    "formatted": "{:>3}/{:<3}".format(
                        _f['perf_stats']['commit_latency_ms'],
                        _f['perf_stats']['apply_latency_ms'],
                    ),
                    "commit_latency_ms": _f['perf_stats']['commit_latency_ms'],
                    "apply_latency_ms": _f['perf_stats']['apply_latency_ms'],
                    "commit_latency_ns": _f['perf_stats']['commit_latency_ns'],
                    "apply_latency_ns": _f['perf_stats']['apply_latency_ns']
                })
        self.ceph_info['osd_latency_data']['table'] = _table
        return _table

    def get_latest_health_readout(self):
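        """
        Pick the most recent health metrics readout for every device
        from ceph_info['ceph_health']['data'].

        :return: dict with the latest readout per device
        """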
        _h = self.ceph_info['ceph_health']['data']
        self.ceph_info['ceph_health']['latest'] = {}
        for _n, _d in _h.items():
            if not _d:
                self.ceph_info['ceph_health']['latest'][_n] = {}
                continue
            # TODO: Consider filtering out or prepare data for the table
            _date = sorted(_d.keys(), reverse=True)[0]
            self.ceph_info['ceph_health']['date'] = _date
            self.ceph_info['ceph_health']['latest'][_n] = _d[_date]

        return self.ceph_info['ceph_health']['latest']

    def print_summary(self):
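        """
        Print cluster status, health checks, device health metrics
        and the OSD latency table to the console.

        :return: none
        """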
        logger_cli.info("\n# Ceph Cluster summary")
        # Health status
        _h = self.ceph_info['health_detail']['data']
        logger_cli.info("Cluster status: {}".format(_h['status']))
        for _chk, _d in _h['checks'].items():
            logger_cli.info(
                "+ {}: {}\n\tSummary: {}".format(
                    _chk,
                    _d['severity'],
                    _d['summary']['message']
                )
            )
            logger_cli.info("\tDetails:")
            for _item in _d['detail']:
                logger_cli.info("\t  {}".format(_item['message']))

        # OSD health metrics
        logger_cli.info("\n# Device health metrics:")
        _fmt = " {:45} {:^14} {:^9} {:^6} {:^6}"
        logger_cli.info(
            _fmt.format(
                "Device Name",
                "Info",
                "Speed",
                "SMART",
                "Tempr."
            )
        )
        _latest = self.get_latest_health_readout()
        for _n, _d in _latest.items():
            if not _d:
                logger_cli.info("{:45} {:<10}".format(_n, "<empty>"))
                continue

            _status = _d['ata_smart_data']['self_test']['status']['passed']
            _status = 'passed' if _status else 'failed'
            logger_cli.info(
                _fmt.format(
                    _n,
                    _d['device']['info_name'],
                    _d['interface_speed']['current']['string'],
                    _status,
                    _d['temperature']['current']
                )
            )

        # Latency table
        logger_cli.info(
            "\n# OSD Latency data ({} iterations, {} sec delay), "
            "table items 'osd_dev: N:cL/aL'\n"
            "    'Commit Latency' -> 'cL', 'Apply Latency' -> 'aL'\n".format(
                self.ceph_info['osd_latency_data']['data']['total'],
                self.ceph_info['osd_latency_data']['data']['delay']
            )
        )
        _strs = self.get_transposed_latency_table()
        for _osd, _list in _strs.items():
            _row = [c["formatted"] for c in _list]
            logger_cli.info(
                "  {:8}: {}".format(
                    _osd,
                    " ".join(_row)
                )
            )
        logger_cli.info("\n")

        # critical config values
        # TODO: print/calculate config values

        return

    def dump_info(self):
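        """Dump collected info to 'cephdump.json' in the working dir."""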
        with open('cephdump.json', 'wt') as _f:
            _f.write(json.dumps(self.ceph_info, indent=2))

    def load_info(self):
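        """Load previously dumped info from 'cephdump.json'."""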
        with open('cephdump.json', 'rt') as _f:
            self.ceph_info = json.load(_f)

    def generate_archive(self, tgzfilename):
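        """
        Write each collected ceph_info section into a tgz archive,
        as json for dict/list data and as plain text otherwise.
        """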
        if not self.ceph_info:
            logger_cli.warning(
                "WARNING: Ceph Info Data not detected. "
                "Consider checking the log for errors."
            )
        else:
            # Create Archive
            logger_cli.info("-> Generating archive '{}'".format(tgzfilename))
            _tgz = TGZFile(
                tgzfilename,
                label="MCP Checker: Generated Ceph Information"
            )
            # Iterate every key and write data to tar file
            for key, d in self.ceph_info.items():
                _filename = None
                # Cast buf to a proper type
                _buf = None
                if isinstance(d["data"], (dict, list)):
                    _buf = json.dumps(d["data"], indent=2)
                    _filename = key + ".json"
                elif isinstance(d["data"], str):
                    _buf = d["data"]
                    _filename = key + ".txt"
                else:
                    _buf = str(d["data"])
                    _filename = key + ".txt"
                logger_cli.debug("... writing '{}'".format(_filename))
                _tgz.add_file(_filename, buf=_buf, replace=True)

        return

    def create_html_report(self, filename):
180 """
181 Create static html showing ceph info report
182
183 :return: none
184 """
185 logger_cli.info("### Generating report to '{}'".format(filename))
186 _report = reporter.ReportToFile(
187 reporter.HTMLCephInfo(self),
188 filename
189 )
190 _report(
191 {
192 "info": self.ceph_info,
193 "cluster": self.cluster_info,
194 "nodes": self.nodes,
195 "ceph_version": self.ceph_version,
196 }
197 )
198 logger_cli.info("-> Done")
199
200 return
201
202
203class SaltCephInfo(CephInfo):
204 def __init__(
205 self,
206 config
207 ):
208 logger_cli.warning("\nWARNING: Not impelented for Salt environment!\n")
209
210 # self.master = SaltNodes(config)
211 super(SaltCephInfo, self).__init__(config)
212 return
213
214
215class KubeCephInfo(CephInfo):
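    """Ceph info collector for Rook-managed Kubernetes environments.

    Commands are executed inside the rook-ceph-tools pod, which is
    looked up in the 'rook-ceph' namespace on init.

    Usage sketch (assuming a parsed cfg-checker config object):

        ceph = KubeCephInfo(config)
        ceph.gather_info()
        ceph.print_summary()
        ceph.generate_archive("ceph_info.tgz")
        ceph.create_html_report("ceph_info.html")
    """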
    ceph_ns = "rook-ceph"
    ceph_app_label = "rook-ceph-tools"
    ceph_group = "ceph.rook.io"
    ceph_apiversion = "v1"
    ceph_plural = "cephclusters"
    ceph_version = "unknown"

    def __init__(self, config):
        self.master = KubeNodes(config)
        super(KubeCephInfo, self).__init__(config)
        # Init ceph tools pod
        self.pod_name = self._get_tools_pod_name()
        self.ceph_info = {}
        self.cluster_info = {}
        self.ceph_version = self.get_ceph_cluster_config()

    def _safe_tools_cmd(self, cmd, expect_output=True):
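        """
        Run a command in the ceph tools pod, logging a debug message
        when expected output is empty and a warning when unexpected
        output appears.

        :return: raw command output string
        """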
        _r = self.master.exec_cmd_on_target_pod(
            self.pod_name,
            self.ceph_ns,
            cmd
        )
        if expect_output and not _r:
            logger_cli.debug("... got empty output for '{}'".format(cmd))
        elif not expect_output and _r:
            logger_cli.warning(
                "WARNING: Unexpected output for '{}':\n"
                "===== Start\n{}\n===== End".format(cmd, _r)
            )
        return _r

    def _safe_get_cmd_output_as_json(self, cmd):
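        """
        Run a command in the tools pod and parse its output as json.
        Falls back to returning the raw output if parsing fails.
        """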
        _buf = self._safe_tools_cmd(cmd)
        try:
            return json.loads(_buf)
        except ValueError:
            logger_cli.error(
                "\nERROR: failed to parse json: '{}'".format(_buf)
            )
            return _buf

    def _get_tools_pod_name(self):
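        """
        Find the ceph tools pod by partial name match; raises
        KubeException when no matching pod exists.
        """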
        # get ceph pod
        _names = self.master.kube.get_pod_names_by_partial_name(
            self.ceph_app_label,
            self.ceph_ns
        )
        if not _names:
            raise KubeException(
                "Failed to find pod using '{}'".format(self.ceph_app_label)
            )
        elif len(_names) > 1:
            logger_cli.warning(
                "WARNING: Environment has more than one pod "
                "with '{}' app: {}".format(
                    self.ceph_app_label,
                    ", ".join(_names)
                )
            )
        else:
            logger_cli.debug("... found '{}'".format(_names[0]))
        return _names[0]

    def _add_ceph_info_item(self, key, title, data):
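        """Store or update a titled data section in self.ceph_info."""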
        if key in self.ceph_info:
            self.ceph_info[key]["title"] = title
            self.ceph_info[key]["data"] = data
        else:
            self.ceph_info[key] = {
                "title": title,
                "data": data
            }

    def _parse_dev_classes(self, deviceClasses):
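        """
        Flatten device class mappings into a set of class names,
        e.g. [{"name": "hdd"}, {"name": "ssd"}] -> {"hdd", "ssd"}
        (input shape assumed from the CephCluster status reported by Rook).
        """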
        _devClasses = []
        for _i in deviceClasses:
            _devClasses += list(_i.values())
        return set(_devClasses)

    def get_ceph_cluster_config(self):
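        """
        Load the 'cephclusters' custom resource and cache basic cluster
        facts (image, version, device classes, health) in cluster_info.

        :return: detected ceph version string, or 'unknown'
        """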
        # get cephclusters resource
        logger_cli.info("# Loading '{}' object of type '{}/{}'".format(
            self.ceph_plural,
            self.ceph_group,
            self.ceph_apiversion
        ))
        _r = self.master.kube.get_custom_resource(
            self.ceph_group,
            self.ceph_apiversion,
            self.ceph_plural,
        )
        # find cluster
        _cluster = None
        if len(_r['items']) < 1:
            logger_cli.warning(
                "WARNING: Failed to find '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
            return 'unknown'
        elif len(_r['items']) > 1:
            logger_cli.warning(
                "WARNING: Multiple clusters found '{}' ({}/{})".format(
                    self.ceph_plural,
                    self.ceph_group,
                    self.ceph_apiversion
                )
            )
        _cluster = _r['items'][0]
        _s = _cluster['status']
        self.cluster_info.update({
            'image': _s['version']['image'],
            'version': _s['version']['version'],
            'device_classes': self._parse_dev_classes(
                _s['storage']['deviceClasses']
            ),
            'phase': _s['phase'],
            'state': _s['state'],
            'health': _s['ceph']['health'],
            'previousHealth': _s['ceph']['previousHealth'],
            'lastChanged': _s['ceph']['lastChanged'],
            'lastChecked': _s['ceph']['lastChecked'],
            'mon_count': _cluster['spec']['mon']['count']
        })
        self.nodes = _cluster['spec']['storage']['nodes']
        logger_cli.info("-> Found Ceph cluster: {} ({})".format(
            self.cluster_info['version'],
            self.cluster_info['image']
        ))
        return self.cluster_info['version']

    def gather_info(self):
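        """
        Collect cluster data using ceph CLI tools: crush maps, status,
        health, mon/osd/pg dumps, usage, auth (anonymized), device
        health metrics and repeated OSD latency samples.
        """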
350 logger_cli.info("# Gathering Ceph cluster info")
351 # Collect info
352 _c = self._safe_tools_cmd
353 _cj = self._safe_get_cmd_output_as_json
354 # Crush Map
355 logger_cli.info("-> Collecting CRUSH map")
356 _cmap_tmp_path = "/tmp/crushmap.bin"
357 _r = _c(
358 "ceph osd getcrushmap -o " + _cmap_tmp_path,
359 expect_output=False
360 )
361 # TODO: Handle errors in _r
362 logger_cli.debug("... 'getcrushmap' return value is: '{}'".format(_r))
363
364 # Get Crush map as json and text
365 self._add_ceph_info_item(
366 "crushmap_json",
367 "Crush Map (json)",
368 _cj("crushtool -i " + _cmap_tmp_path + " --dump")
369 )
370 # _crushmap = _cj("crushtool -i " + _cmap_tmp_path + " --dump")
371 self._add_ceph_info_item(
372 "crushmap_text",
373 "Crush Map (text)",
374 _c("crushtool -d " + _cmap_tmp_path)
375 )
376
377 logger_cli.info("-> Collecting ceph osd crush dump")
378 self._add_ceph_info_item(
379 "osd_crushdump",
380 "Crush dump (osd)",
381 _cj("ceph osd crush dump")
382 )
383
384 logger_cli.info("-> Collecting cluster status")
385 self._add_ceph_info_item(
386 "cluster_status",
387 "Cluster status",
388 _cj("ceph -s -f json")
389 )
390
391 logger_cli.info("-> Collecting health detail")
392 self._add_ceph_info_item(
393 "health_detail",
394 "Health details",
395 _cj("ceph -f json health detail")
396 )
397
398 logger_cli.info("-> Collecting monmap")
399 self._add_ceph_info_item(
400 "monmap",
401 "Ceph Mon map",
402 _cj("ceph mon dump -f json")
403 )
404
405 logger_cli.info("-> Collecting ceph df")
406 self._add_ceph_info_item(
407 "ceph_df",
408 "Ceph DF",
409 _cj("ceph df -f json")
410 )
411
412 logger_cli.info("-> Collecting ceph osd df")
413 self._add_ceph_info_item(
414 "ceph_osd_df",
415 "Ceph OSD DF",
416 _cj("ceph osd df -f json")
417 )
418
419 logger_cli.info("-> Collecting ceph osd dump")
420 self._add_ceph_info_item(
421 "ceph_osd_dump",
422 "Ceph OSD dump",
423 _cj("ceph osd dump -f json")
424 )
425
426 logger_cli.info("-> Collecting rados df")
427 self._add_ceph_info_item(
428 "rados_df",
429 "Rados DF",
430 _cj("rados df -f json")
431 )
432
433 logger_cli.info("-> Collecting ceph report")
434 self._add_ceph_info_item(
435 "ceph_report",
436 "Ceph Report",
437 _cj("ceph report")
438 )
439
440 logger_cli.info("-> Collecting auth data anonymized")
441 _auth_data = _cj("ceph auth list -f json")
442 # Anonymize data
443 # _cj("ceph auth list -f json | sed 's/AQ[^=]*==/KEY/g'")
444 for item in _auth_data["auth_dump"]:
445 if "key" in item:
446 item['key'] = "key-data-redacted"
447 self._add_ceph_info_item(
448 "ceph_auth_ls",
449 "Ceph Auth Data (anonymized)",
450 _auth_data
451 )
452
453 logger_cli.info("-> Collecting ceph pg dump")
454 self._add_ceph_info_item(
455 "ceph_pg_dump",
456 "Ceph PG dump",
457 _cj("ceph pg dump -f json")
458 )
459
460 logger_cli.info("-> Collecting ceph running configuration")
461 self._add_ceph_info_item(
462 "ceph_config_dump",
463 "Ceph Configuration Dump",
464 _cj("ceph config dump -f json")
465 )
466
467 logger_cli.info("-> Collecting health metrics")
468 _health_metrics = {}
469 _devices = _c("ceph device ls")
470 for device in _devices.splitlines():
471 _t = device.split()
472 _osd = _t[2]
473 _dev = _t[0]
474 if _dev == "DEVICE":
475 continue
476 _metric = _cj("ceph device get-health-metrics {}".format(_dev))
477 _health_metrics["{}_{}".format(_osd, _dev)] = _metric
478 self._add_ceph_info_item(
479 "ceph_health",
480 "Ceph Health Metrics",
481 _health_metrics
482 )
483
484 # Latency values
485 # config const for set
486 _latency_count = 10
487 _latency_delay = 4
488 logger_cli.info(
489 "-> Collecting ceph osd latency data "
490 "({} total, {} sec delay)".format(
491 _latency_count,
492 _latency_delay
493 )
494 )
495 _osd_lat = {
496 "total": _latency_count,
497 "delay": _latency_delay,
498 "data": []
499 }
500 _progress = Progress(_latency_count)
501 _index = 1
502 while _index <= _latency_count:
503 _progress.write_progress(_index)
504 _osd_lat["data"].append(_cj("ceph osd perf -f json"))
505 sleep(_latency_delay)
506 _index += 1
507 _progress.end()
508 self._add_ceph_info_item(
509 "osd_latency_data",
510 "OSD Latency metrics",
511 _osd_lat
512 )
513
514 return