Blame - cfg_checker/modules/ceph/__init__.py - mcp/cfg-checker

blob: 5c9357b5441a7d6664fe20acc337a67c3c9b1ca8 [file] [log] [blame]

Alex	0989ecf	2022-03-29 13:43:21 -0500	[diff] [blame]	1	# Author: Alex Savatieiev (osavatieiev@mirantis.com; a.savex@gmail.com)
				2	# Copyright 2019-2022 Mirantis, Inc.
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	3	from cfg_checker.agent.fio_runner import get_fio_options
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	4	from cfg_checker.agent.fio_runner import seq_modes, mix_modes
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	5	from cfg_checker.common import logger_cli
				6	from cfg_checker.common.settings import ENV_TYPE_KUBE
				7	from cfg_checker.helpers import args_utils
				8	from cfg_checker.modules.ceph import info, bench
				9
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	10
# Short description of this module's command group
# (presumably shown in the top-level CLI help — confirm against the loader)
command_help = "Ceph Storage information and benchmarks"
# Environment types this module declares support for; only Kubernetes here
# NOTE(review): how the loader consumes this list is not visible in this file
supported_envs = [ENV_TYPE_KUBE]
				13
				14
				15	# def _selectClass(_env, strClassHint="checker"):
				16	# _class = None
				17	# if _env == ENV_TYPE_SALT:
				18	# if strClassHint == "info":
				19	# _class = info.SaltCephInfo
				20	# elif strClassHint == "bench":
				21	# _class = bench.SaltCephInfo
				22	# elif _env == ENV_TYPE_KUBE:
				23	# if strClassHint == "info":
				24	# _class = info.KubeCephInfo
				25	# elif strClassHint == "bench":
				26	# _class = bench.KubeCephBench
				27	# if not _class:
				28	# raise CheckerException(
				29	# "Unknown hint for selecting Ceph handler Class: '{}'".format(
				30	# strClassHint
				31	# )
				32	# )
				33	# else:
				34	# return _class
				35
def _get_param_and_log(arg, param_str):
    """Read option 'param_str' from parsed arguments, log it and return it.

    :param arg: parsed arguments object handed over to args_utils.get_arg
    :param param_str: name of the option to read
    :return: whatever args_utils.get_arg returned for this option
    """
    _fetched = args_utils.get_arg(arg, param_str)
    # announce the effective value as 'name=value'
    _line = " {}={}".format(param_str, _fetched)
    logger_cli.info(_line)
    return _fetched
				40
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	41
				42	def init_parser(_parser):
				43	# network subparser
				44	ceph_subparsers = _parser.add_subparsers(dest='type')
				45
				46	ceph_info_parser = ceph_subparsers.add_parser(
				47	'info',
				48	help="Gather Ceph Cluster information"
				49	)
				50
				51	ceph_info_parser.add_argument(
				52	'--detailed',
				53	action="store_true", default=False,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	54	help="Print additional details. (Not implemented yet)"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	55	)
				56
				57	ceph_info_parser.add_argument(
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	58	'--client-name',
				59	metavar='client_name',
				60	help="Client name for archive naming"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	61	)
				62
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	63	ceph_info_parser.add_argument(
				64	'--project-name',
				65	metavar='projectname',
				66	help="Project name for archive naming"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	67	)
				68
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	69	ceph_info_parser.add_argument(
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	70	'--html',
				71	metavar='ceph_html_filename',
				72	help="HTML filename to save report"
				73	)
				74
				75	ceph_bench_parser = ceph_subparsers.add_parser(
				76	'bench',
				77	help="Run ceph benchmark"
				78	)
				79
				80	ceph_bench_parser.add_argument(
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	81	'--agents',
				82	type=int, metavar='agent_count', default=5,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	83	help="Number of agents to use in all test runs. Default: 5"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	84	)
				85	ceph_bench_parser.add_argument(
				86	'--html',
				87	metavar='ceph_html_filename',
				88	help="HTML filename to save report"
				89	)
				90	ceph_bench_parser.add_argument(
				91	'--storage-class',
				92	metavar='storage_class',
				93	help="Storage class to be used in benchmark"
				94	)
				95	ceph_bench_parser.add_argument(
				96	'--task-file',
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	97	metavar='task_file',
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	98	help="Task file for benchmark with parameters to use"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	99	)
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	100	ceph_bench_parser.add_argument(
				101	'--no-cleanup',
				102	action="store_true", default=False,
				103	help="Do not cleanup services, agents, pvc, and pv"
				104	)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	105	ceph_bench_parser.add_argument(
				106	'--cleanup-only',
				107	action="store_true", default=False,
				108	help="Cleanup resources related to benchmark"
				109	)
				110	ceph_bench_parser.add_argument(
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	111	'--report-only',
				112	action="store_true", default=False,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	113	help="Just create report using files in '--dump-path' folder"
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	114	)
				115	ceph_bench_parser.add_argument(
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	116	'--dump-path',
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	117	metavar="dump_results",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	118	help="Dump result after each test run to use them later. "
				119	"Default: '/tmp'"
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	120	)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	121	ceph_bench_parser.add_argument(
				122	'--name',
				123	metavar="name", default="cephbench",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	124	help="Job name to use for running fio. "
				125	"Can be used to grep results. Default: 'cephbench'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	126	)
				127	ceph_bench_parser.add_argument(
				128	'--bs',
				129	metavar="blocksize", default="16k",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	130	help="Block size for single run. Default: '16k'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	131	)
				132	ceph_bench_parser.add_argument(
				133	'--iodepth',
				134	metavar="iodepth", default="16",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	135	help="IO Depth for single run. Default: '16'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	136	)
				137	ceph_bench_parser.add_argument(
				138	'--size',
				139	metavar="size", default="10G",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	140	help="Persistent volume size (M, G). Default: '10G'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	141	)
				142	ceph_bench_parser.add_argument(
				143	'--readwrite',
				144	metavar="readwrite", default="randrw",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	145	help="Test mode for single run (read, write, randrw, "
				146	"randread, randwrite). Default: 'randrw'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	147	)
				148	ceph_bench_parser.add_argument(
				149	'--rwmixread',
				150	metavar="rwmixread", default="50",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	151	help="Percent of read in random mixed mode (randrw). Default: '50'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	152	)
				153	ceph_bench_parser.add_argument(
				154	'--ramp-time',
				155	metavar="ramp_time", default="5s",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	156	help="Warmup time before test. Default: '5s'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	157	)
				158	ceph_bench_parser.add_argument(
				159	'--runtime',
				160	metavar="runtime", default="60s",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	161	help="How long to run test. Default: '60s'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	162	)
				163	ceph_bench_parser.add_argument(
				164	'--ioengine',
				165	metavar="ioengine", default="libaio",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	166	help="IO Engine used by fio. See 'fio eng-help' output for list. "
				167	"Default: 'libaio'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	168	)
				169	ceph_bench_parser.add_argument(
				170	'--offset-increment',
				171	metavar="offset_increment", default="500M",
Alex	41dd0cc	2022-02-09 17:33:23 -0600	[diff] [blame]	172	help="Offset to be used in 'read' and 'write' modes if multiple jobs "
				173	"used"
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	174	"Default: '500M'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	175	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	176
				177	return _parser
				178
				179
				180	def do_info(args, config):
				181	# Ceph info
				182	# Gather ceph info and create an archive with data
				183	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	184
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	185	# check client and project names
				186	if not args.client_name or not args.project_name:
				187	logger_cli.error(
				188	"ERROR: Missing '--client-name' or '--project-name' options"
				189	)
				190	return
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	191	# _class = _selectClass(_env)
				192	ceph_info = info.KubeCephInfo(config)
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	193	_tgzfilename = ceph_info.get_info_archive_filename(
				194	args.client_name,
				195	args.project_name
				196	)
				197	logger_cli.info("# Archive will be generated to '{}'".format(_tgzfilename))
				198	# get html
				199	_htmlfilename = args_utils.get_arg(args, 'html')
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	200
				201	logger_cli.info("# Collecting Ceph cluster information")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	202	ceph_info.gather_info()
Alex	41dd0cc	2022-02-09 17:33:23 -0600	[diff] [blame]	203	ceph_info.gather_osd_configs()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	204
				205	# Debug, enable if needed to debug report generation
				206	# without actuall data collecting each time
				207	# ceph_info.dump_info()
				208	# ceph_info.load_info()
				209	# end debug
				210
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	211	ceph_info.get_transposed_latency_table()
				212	ceph_info.get_latest_health_readout()
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	213	ceph_info.create_html_report(_htmlfilename)
				214
				215	# handle cli part
				216	ceph_info.generate_archive(_tgzfilename)
				217	ceph_info.print_summary()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	218
				219	return
				220
				221
				222	def do_bench(args, config):
				223	# Ceph Benchmark using multiple pods
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	224	# if only cleanup needed do it and exit
				225	_cleanup_only = args_utils.get_arg(args, 'cleanup_only')
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	226	_report_only = args_utils.get_arg(args, 'report_only')
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	227	config.resource_prefix = "cfgagent"
				228	if _cleanup_only:
				229	# Do forced resource cleanup and exit
				230	config.bench_mode = "cleanup"
				231	config.bench_agent_count = -1
				232	ceph_bench = bench.KubeCephBench(config)
				233	logger_cli.info(
				234	"# Discovering benchmark resources using prefix of '{}'".format(
				235	config.resource_prefix
				236	)
				237	)
				238	ceph_bench.prepare_cleanup()
				239	ceph_bench.cleanup()
				240	return
				241
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	242	# dump results options
				243	_dump_path = args_utils.get_arg(args, "dump_path")
				244	if _dump_path:
				245	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				246	config.bench_results_dump_path = _dump_path
				247	else:
				248	_p = "/tmp"
				249	logger_cli.info(
				250	"# No result dump path set. Defaulting to {}"
				251	"Consider setting it if running long task_file "
				252	"based test runs".format(_p)
				253	)
				254	config.bench_results_dump_path = _p
				255
				256	# Report filename
				257	_filename = args_utils.get_arg(args, 'html')
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	258	# gather Ceph info
				259	logger_cli.info("# Collecting Ceph cluster information")
				260	ceph_info = info.KubeCephInfo(config)
				261
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	262	# Task files or options
				263	_opts = get_fio_options()
				264	# Load name and announce it
				265	config.bench_name = args_utils.get_arg(args, "name")
				266	_opts["name"] = config.bench_name
				267	logger_cli.info(
				268	"# Using '{}' as ceph bench jobs name".format(_opts["name"])
				269	)
				270
				271	if _report_only:
				272	# Do forced report creation and exit
				273	config.bench_mode = "report"
				274	config.bench_agent_count = -1
				275	ceph_bench = bench.KubeCephBench(config)
				276	ceph_bench.set_ceph_info_class(ceph_info)
				277	logger_cli.info(
				278	"# Preparing to generate report '{}'".format(
				279	config.resource_prefix
				280	)
				281	)
				282	# Preload previous results for this name
				283	ceph_bench.preload_results()
				284	# Gather ceph data
				285	ceph_bench.wait_ceph_cooldown()
				286	# Generate report
				287	ceph_bench.create_report(_filename)
				288	return
				289
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	290	# Prepare the tasks and do synced testrun or a single one
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	291	logger_cli.info("# Initializing ceph benchmark module")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	292	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	293	# agents count option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	294	config.bench_agent_count = args_utils.get_arg(args, "agents")
				295	logger_cli.info("-> using {} agents".format(config.bench_agent_count))
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	296	# Cleaning option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	297	config.no_cleaning_after_benchmark = args_utils.get_arg(args, "no_cleanup")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	298	# storage class
				299	_storage_class = args_utils.get_arg(args, "storage_class")
				300	logger_cli.info("-> using storage class of '{}'".format(_storage_class))
				301	config.bench_storage_class = _storage_class
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	302	if _dump_path:
				303	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				304	config.bench_results_dump_path = _dump_path
				305	else:
				306	logger_cli.info(
				307	"# No result dump path set. "
				308	"Consider setting it if running long task_file based test runs"
				309	)
				310	config.bench_results_dump_path = _dump_path
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	311
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	312	_task_file = args_utils.get_arg(args, "task_file", nofail=True)
				313	if not _task_file:
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	314	logger_cli.info("-> Running single benchmark run")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	315	config.bench_mode = "single"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	316	# Updating _opts from arguments
				317	_params = [
				318	"bs",
				319	"iodepth",
				320	"size",
				321	"readwrite",
				322	"ramp_time",
				323	"runtime",
				324	"ioengine"
				325	]
				326	for _p in _params:
				327	_opts[_p] = _get_param_and_log(args, _p)
				328	if _opts["readwrite"] in seq_modes:
				329	_p = "offset_increment"
				330	_opts[_p] = _get_param_and_log(args, _p)
				331	elif _opts["readwrite"] in mix_modes:
				332	_p = "rwmixread"
				333	_opts[_p] = _get_param_and_log(args, _p)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	334	else:
				335	logger_cli.info("-> running with tasks from '{}'".format(_task_file))
				336	config.bench_task_file = _task_file
				337	config.bench_mode = "tasks"
Alex	e4de114	2022-11-04 19:26:03 -0500	[diff] [blame^]	338	# Add default size to options
				339	_opts["size"] = _get_param_and_log(args, "size")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	340	logger_cli.debug("... default/selected options for fio:")
				341	for _k in _opts.keys():
				342	# TODO: Update options for single run
				343	logger_cli.debug(" {} = {}".format(_k, _opts[_k]))
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	344
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	345	# init the Bench class
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	346	ceph_bench = bench.KubeCephBench(config)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	347	ceph_bench.set_ceph_info_class(ceph_info)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	348	# Preload previous results for this name
				349	ceph_bench.preload_results()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	350	# Do the testrun
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	351	ceph_bench.prepare_agents(_opts)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	352	ceph_bench.wait_ceph_cooldown()
				353
				354	# DEBUG of report in progress
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	355	if not ceph_bench.run_benchmark(_opts):
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	356	# No cleaning and/or report if benchmark was not finished
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	357	logger_cli.info("# Abnormal benchmark run, no cleaning performed")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	358	return
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	359	# Remove after DEBUG
				360	# ceph_bench.collect_results(_opts)
				361	# END DEBUG
				362
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	363	# Cleaning
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	364	if not config.no_cleaning_after_benchmark:
				365	ceph_bench.cleanup()
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	366	else:
				367	logger_cli.info(
				368	"# '--no-cleaning' option set. Cleaning not conducted."
				369	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	370
				371	# Create report
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	372	ceph_bench.create_report(_filename)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	373
				374	return