Blame - cfg_checker/modules/ceph/__init__.py - mcp/cfg-checker

blob: 29b9a2980387f3c19d9f5a7970254d2620ff94a5 [file] [log] [blame]

Alex	0989ecf	2022-03-29 13:43:21 -0500	[diff] [blame^]	1	# Author: Alex Savatieiev (osavatieiev@mirantis.com; a.savex@gmail.com)
				2	# Copyright 2019-2022 Mirantis, Inc.
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	3	from cfg_checker.agent.fio_runner import get_fio_options
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	4	from cfg_checker.agent.fio_runner import seq_modes, mix_modes
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	5	from cfg_checker.common import logger_cli
				6	from cfg_checker.common.settings import ENV_TYPE_KUBE
				7	from cfg_checker.helpers import args_utils
				8	from cfg_checker.modules.ceph import info, bench
				9
# One-line description of this module's command
# (presumably shown in the CLI help by the module loader - TODO confirm).
command_help = "Ceph Storage information and benchmarks"
# Environment types this module declares as supported: ENV_TYPE_KUBE only.
supported_envs = [ENV_TYPE_KUBE]
				12
				13
				14	# def _selectClass(_env, strClassHint="checker"):
				15	# _class = None
				16	# if _env == ENV_TYPE_SALT:
				17	# if strClassHint == "info":
				18	# _class = info.SaltCephInfo
				19	# elif strClassHint == "bench":
				20	# _class = bench.SaltCephInfo
				21	# elif _env == ENV_TYPE_KUBE:
				22	# if strClassHint == "info":
				23	# _class = info.KubeCephInfo
				24	# elif strClassHint == "bench":
				25	# _class = bench.KubeCephBench
				26	# if not _class:
				27	# raise CheckerException(
				28	# "Unknown hint for selecting Ceph handler Class: '{}'".format(
				29	# strClassHint
				30	# )
				31	# )
				32	# else:
				33	# return _class
				34
def _get_param_and_log(arg, param_str):
    """Fetch option ``param_str`` from ``arg``, log it and return it.

    :param arg: parsed arguments object handed to ``args_utils.get_arg``
    :param param_str: name of the option to read
    :return: value returned by ``args_utils.get_arg``
    """
    value = args_utils.get_arg(arg, param_str)
    message = " {}={}".format(param_str, value)
    logger_cli.info(message)
    return value
				39
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	40
				41	def init_parser(_parser):
				42	# network subparser
				43	ceph_subparsers = _parser.add_subparsers(dest='type')
				44
				45	ceph_info_parser = ceph_subparsers.add_parser(
				46	'info',
				47	help="Gather Ceph Cluster information"
				48	)
				49
				50	ceph_info_parser.add_argument(
				51	'--detailed',
				52	action="store_true", default=False,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	53	help="Print additional details. (Not implemented yet)"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	54	)
				55
				56	ceph_info_parser.add_argument(
				57	'--tgz',
				58	metavar='ceph_tgz_filename',
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	59	help="TGZ archive filename to save gathered data"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	60	)
				61
				62	ceph_report_parser = ceph_subparsers.add_parser(
				63	'report',
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	64	help="Generate Ceph Info report"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	65	)
				66
				67	ceph_report_parser.add_argument(
				68	'--html',
				69	metavar='ceph_html_filename',
				70	help="HTML filename to save report"
				71	)
				72
				73	ceph_bench_parser = ceph_subparsers.add_parser(
				74	'bench',
				75	help="Run ceph benchmark"
				76	)
				77
				78	ceph_bench_parser.add_argument(
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	79	'--agents',
				80	type=int, metavar='agent_count', default=5,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	81	help="Number of agents to use in all test runs. Default: 5"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	82	)
				83	ceph_bench_parser.add_argument(
				84	'--html',
				85	metavar='ceph_html_filename',
				86	help="HTML filename to save report"
				87	)
				88	ceph_bench_parser.add_argument(
				89	'--storage-class',
				90	metavar='storage_class',
				91	help="Storage class to be used in benchmark"
				92	)
				93	ceph_bench_parser.add_argument(
				94	'--task-file',
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	95	metavar='task_file',
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	96	help="Task file for benchmark with parameters to use"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	97	)
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	98	ceph_bench_parser.add_argument(
				99	'--no-cleanup',
				100	action="store_true", default=False,
				101	help="Do not cleanup services, agents, pvc, and pv"
				102	)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	103	ceph_bench_parser.add_argument(
				104	'--cleanup-only',
				105	action="store_true", default=False,
				106	help="Cleanup resources related to benchmark"
				107	)
				108	ceph_bench_parser.add_argument(
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	109	'--report-only',
				110	action="store_true", default=False,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	111	help="Just create report using files in '--dump-path' folder"
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	112	)
				113	ceph_bench_parser.add_argument(
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	114	'--dump-path',
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	115	metavar="dump_results",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	116	help="Dump result after each test run to use them later. "
				117	"Default: '/tmp'"
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	118	)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	119	ceph_bench_parser.add_argument(
				120	'--name',
				121	metavar="name", default="cephbench",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	122	help="Job name to use for running fio. "
				123	"Can be used to grep results. Default: 'cephbench'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	124	)
				125	ceph_bench_parser.add_argument(
				126	'--bs',
				127	metavar="blocksize", default="16k",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	128	help="Block size for single run. Default: '16k'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	129	)
				130	ceph_bench_parser.add_argument(
				131	'--iodepth',
				132	metavar="iodepth", default="16",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	133	help="IO Depth for single run. Default: '16'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	134	)
				135	ceph_bench_parser.add_argument(
				136	'--size',
				137	metavar="size", default="10G",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	138	help="Persistent volume size (M, G). Default: '10G'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	139	)
				140	ceph_bench_parser.add_argument(
				141	'--readwrite',
				142	metavar="readwrite", default="randrw",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	143	help="Test mode for single run (read, write, randrw, "
				144	"randread, randwrite). Default: 'randrw'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	145	)
				146	ceph_bench_parser.add_argument(
				147	'--rwmixread',
				148	metavar="rwmixread", default="50",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	149	help="Percent of read in random mixed mode (randrw). Default: '50'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	150	)
				151	ceph_bench_parser.add_argument(
				152	'--ramp-time',
				153	metavar="ramp_time", default="5s",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	154	help="Warmup time before test. Default: '5s'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	155	)
				156	ceph_bench_parser.add_argument(
				157	'--runtime',
				158	metavar="runtime", default="60s",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	159	help="How long to run test. Default: '60s'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	160	)
				161	ceph_bench_parser.add_argument(
				162	'--ioengine',
				163	metavar="ioengine", default="libaio",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	164	help="IO Engine used by fio. See 'fio eng-help' output for list. "
				165	"Default: 'libaio'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	166	)
				167	ceph_bench_parser.add_argument(
				168	'--offset-increment',
				169	metavar="offset_increment", default="500M",
Alex	41dd0cc	2022-02-09 17:33:23 -0600	[diff] [blame]	170	help="Offset to be used in 'read' and 'write' modes if multiple jobs "
				171	"used"
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	172	"Default: '500M'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	173	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	174
				175	return _parser
				176
				177
				178	def do_info(args, config):
				179	# Ceph info
				180	# Gather ceph info and create an archive with data
				181	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
				182	# check tgz
				183	_tgzfile = "ceph_info_archive.tgz" if not args.tgz else args.tgz
				184
				185	# _class = _selectClass(_env)
				186	ceph_info = info.KubeCephInfo(config)
				187
				188	logger_cli.info("# Collecting Ceph cluster information")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	189	ceph_info.gather_info()
Alex	41dd0cc	2022-02-09 17:33:23 -0600	[diff] [blame]	190	ceph_info.gather_osd_configs()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	191
				192	# Debug, enable if needed to debug report generation
				193	# without actuall data collecting each time
				194	# ceph_info.dump_info()
				195	# ceph_info.load_info()
				196	# end debug
				197
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	198	ceph_info.generate_archive(_tgzfile)
Alex	df9cc3a	2021-10-12 14:37:28 -0500	[diff] [blame]	199	ceph_info.print_summary()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	200
				201	return
				202
				203
				204	def do_report(args, config):
				205	# Ceph Report
				206	# Gather ceph info and create HTML report with all of the data
				207	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
				208	_filename = args_utils.get_arg(args, 'html')
				209	logger_cli.info("# Ceph cluster Configuration report")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	210
				211	# _class = _selectClass(_env)
				212	ceph_info = info.KubeCephInfo(config)
				213	# Debug, enable if needed to debug report generation
				214	# without actuall data collecting each time
				215	# ceph_info.load_info()
				216	# end debug
				217	ceph_info.gather_info()
Alex	41dd0cc	2022-02-09 17:33:23 -0600	[diff] [blame]	218	ceph_info.gather_osd_configs()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	219	ceph_info.get_transposed_latency_table()
				220	ceph_info.get_latest_health_readout()
				221	ceph_info.create_html_report(_filename)
				222
				223	return
				224
				225
				226	def do_bench(args, config):
				227	# Ceph Benchmark using multiple pods
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	228	# if only cleanup needed do it and exit
				229	_cleanup_only = args_utils.get_arg(args, 'cleanup_only')
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	230	_report_only = args_utils.get_arg(args, 'report_only')
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	231	config.resource_prefix = "cfgagent"
				232	if _cleanup_only:
				233	# Do forced resource cleanup and exit
				234	config.bench_mode = "cleanup"
				235	config.bench_agent_count = -1
				236	ceph_bench = bench.KubeCephBench(config)
				237	logger_cli.info(
				238	"# Discovering benchmark resources using prefix of '{}'".format(
				239	config.resource_prefix
				240	)
				241	)
				242	ceph_bench.prepare_cleanup()
				243	ceph_bench.cleanup()
				244	return
				245
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	246	# dump results options
				247	_dump_path = args_utils.get_arg(args, "dump_path")
				248	if _dump_path:
				249	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				250	config.bench_results_dump_path = _dump_path
				251	else:
				252	_p = "/tmp"
				253	logger_cli.info(
				254	"# No result dump path set. Defaulting to {}"
				255	"Consider setting it if running long task_file "
				256	"based test runs".format(_p)
				257	)
				258	config.bench_results_dump_path = _p
				259
				260	# Report filename
				261	_filename = args_utils.get_arg(args, 'html')
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	262	# gather Ceph info
				263	logger_cli.info("# Collecting Ceph cluster information")
				264	ceph_info = info.KubeCephInfo(config)
				265
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	266	# Task files or options
				267	_opts = get_fio_options()
				268	# Load name and announce it
				269	config.bench_name = args_utils.get_arg(args, "name")
				270	_opts["name"] = config.bench_name
				271	logger_cli.info(
				272	"# Using '{}' as ceph bench jobs name".format(_opts["name"])
				273	)
				274
				275	if _report_only:
				276	# Do forced report creation and exit
				277	config.bench_mode = "report"
				278	config.bench_agent_count = -1
				279	ceph_bench = bench.KubeCephBench(config)
				280	ceph_bench.set_ceph_info_class(ceph_info)
				281	logger_cli.info(
				282	"# Preparing to generate report '{}'".format(
				283	config.resource_prefix
				284	)
				285	)
				286	# Preload previous results for this name
				287	ceph_bench.preload_results()
				288	# Gather ceph data
				289	ceph_bench.wait_ceph_cooldown()
				290	# Generate report
				291	ceph_bench.create_report(_filename)
				292	return
				293
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	294	# Prepare the tasks and do synced testrun or a single one
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	295	logger_cli.info("# Initializing ceph benchmark module")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	296	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	297	# agents count option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	298	config.bench_agent_count = args_utils.get_arg(args, "agents")
				299	logger_cli.info("-> using {} agents".format(config.bench_agent_count))
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	300	# Cleaning option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	301	config.no_cleaning_after_benchmark = args_utils.get_arg(args, "no_cleanup")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	302	# storage class
				303	_storage_class = args_utils.get_arg(args, "storage_class")
				304	logger_cli.info("-> using storage class of '{}'".format(_storage_class))
				305	config.bench_storage_class = _storage_class
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	306	if _dump_path:
				307	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				308	config.bench_results_dump_path = _dump_path
				309	else:
				310	logger_cli.info(
				311	"# No result dump path set. "
				312	"Consider setting it if running long task_file based test runs"
				313	)
				314	config.bench_results_dump_path = _dump_path
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	315
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	316	_task_file = args_utils.get_arg(args, "task_file", nofail=True)
				317	if not _task_file:
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	318	logger_cli.info("-> Running single benchmark run")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	319	config.bench_mode = "single"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	320	# Updating _opts from arguments
				321	_params = [
				322	"bs",
				323	"iodepth",
				324	"size",
				325	"readwrite",
				326	"ramp_time",
				327	"runtime",
				328	"ioengine"
				329	]
				330	for _p in _params:
				331	_opts[_p] = _get_param_and_log(args, _p)
				332	if _opts["readwrite"] in seq_modes:
				333	_p = "offset_increment"
				334	_opts[_p] = _get_param_and_log(args, _p)
				335	elif _opts["readwrite"] in mix_modes:
				336	_p = "rwmixread"
				337	_opts[_p] = _get_param_and_log(args, _p)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	338	else:
				339	logger_cli.info("-> running with tasks from '{}'".format(_task_file))
				340	config.bench_task_file = _task_file
				341	config.bench_mode = "tasks"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	342	logger_cli.debug("... default/selected options for fio:")
				343	for _k in _opts.keys():
				344	# TODO: Update options for single run
				345	logger_cli.debug(" {} = {}".format(_k, _opts[_k]))
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	346
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	347	# init the Bench class
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	348	ceph_bench = bench.KubeCephBench(config)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	349	ceph_bench.set_ceph_info_class(ceph_info)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	350	# Preload previous results for this name
				351	ceph_bench.preload_results()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	352	# Do the testrun
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	353	ceph_bench.prepare_agents(_opts)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	354	ceph_bench.wait_ceph_cooldown()
				355
				356	# DEBUG of report in progress
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	357	if not ceph_bench.run_benchmark(_opts):
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	358	# No cleaning and/or report if benchmark was not finished
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	359	logger_cli.info("# Abnormal benchmark run, no cleaning performed")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	360	return
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	361	# Remove after DEBUG
				362	# ceph_bench.collect_results(_opts)
				363	# END DEBUG
				364
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	365	# Cleaning
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	366	if not config.no_cleaning_after_benchmark:
				367	ceph_bench.cleanup()
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	368	else:
				369	logger_cli.info(
				370	"# '--no-cleaning' option set. Cleaning not conducted."
				371	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	372
				373	# Create report
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	374	ceph_bench.create_report(_filename)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	375
				376	return