Blame - cfg_checker/modules/ceph/__init__.py - mcp/cfg-checker

blob: e2f00491bfa838a52d60435ceafe0cc74c0dc4a3 [file] [log] [blame]

Alex	0989ecf	2022-03-29 13:43:21 -0500	[diff] [blame]	1	# Author: Alex Savatieiev (osavatieiev@mirantis.com; a.savex@gmail.com)
				2	# Copyright 2019-2022 Mirantis, Inc.
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	3	from datetime import datetime
				4
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	5	from cfg_checker.agent.fio_runner import get_fio_options
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	6	from cfg_checker.agent.fio_runner import seq_modes, mix_modes
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	7	from cfg_checker.common import logger_cli
				8	from cfg_checker.common.settings import ENV_TYPE_KUBE
				9	from cfg_checker.helpers import args_utils
				10	from cfg_checker.modules.ceph import info, bench
				11
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	12
# Short description of this module's command
# (presumably shown in the CLI help - confirm against the loader)
command_help = "Ceph Storage information and benchmarks"
# Environment types this module declares as supported: Kubernetes only
supported_envs = [ENV_TYPE_KUBE]
				15
				16
				17	# def _selectClass(_env, strClassHint="checker"):
				18	# _class = None
				19	# if _env == ENV_TYPE_SALT:
				20	# if strClassHint == "info":
				21	# _class = info.SaltCephInfo
				22	# elif strClassHint == "bench":
				23	# _class = bench.SaltCephInfo
				24	# elif _env == ENV_TYPE_KUBE:
				25	# if strClassHint == "info":
				26	# _class = info.KubeCephInfo
				27	# elif strClassHint == "bench":
				28	# _class = bench.KubeCephBench
				29	# if not _class:
				30	# raise CheckerException(
				31	# "Unknown hint for selecting Ceph handler Class: '{}'".format(
				32	# strClassHint
				33	# )
				34	# )
				35	# else:
				36	# return _class
				37
def _get_param_and_log(arg, param_str):
    """Fetch one option from parsed arguments, log it and return it.

    :param arg: parsed arguments object passed on to ``args_utils.get_arg``
    :param param_str: name of the option to fetch
    :return: whatever ``args_utils.get_arg`` returned for ``param_str``
    """
    fetched = args_utils.get_arg(arg, param_str)
    # announce the effective value as 'name=value' on the CLI logger
    message = " {}={}".format(param_str, fetched)
    logger_cli.info(message)
    return fetched
				42
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	43
				44	def init_parser(_parser):
				45	# network subparser
				46	ceph_subparsers = _parser.add_subparsers(dest='type')
				47
				48	ceph_info_parser = ceph_subparsers.add_parser(
				49	'info',
				50	help="Gather Ceph Cluster information"
				51	)
				52
				53	ceph_info_parser.add_argument(
				54	'--detailed',
				55	action="store_true", default=False,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	56	help="Print additional details. (Not implemented yet)"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	57	)
				58
				59	ceph_info_parser.add_argument(
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	60	'--client-name',
				61	metavar='client_name',
				62	help="Client name for archive naming"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	63	)
				64
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	65	ceph_info_parser.add_argument(
				66	'--project-name',
				67	metavar='projectname',
				68	help="Project name for archive naming"
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	69	)
				70
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	71	ceph_info_parser.add_argument(
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	72	'--html',
				73	metavar='ceph_html_filename',
				74	help="HTML filename to save report"
				75	)
				76
				77	ceph_bench_parser = ceph_subparsers.add_parser(
				78	'bench',
				79	help="Run ceph benchmark"
				80	)
				81
				82	ceph_bench_parser.add_argument(
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	83	'--agents',
				84	type=int, metavar='agent_count', default=5,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	85	help="Number of agents to use in all test runs. Default: 5"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	86	)
				87	ceph_bench_parser.add_argument(
				88	'--html',
				89	metavar='ceph_html_filename',
				90	help="HTML filename to save report"
				91	)
				92	ceph_bench_parser.add_argument(
				93	'--storage-class',
				94	metavar='storage_class',
				95	help="Storage class to be used in benchmark"
				96	)
				97	ceph_bench_parser.add_argument(
				98	'--task-file',
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	99	metavar='task_file',
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	100	help="Task file for benchmark with parameters to use"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	101	)
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	102	ceph_bench_parser.add_argument(
				103	'--no-cleanup',
				104	action="store_true", default=False,
				105	help="Do not cleanup services, agents, pvc, and pv"
				106	)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	107	ceph_bench_parser.add_argument(
				108	'--cleanup-only',
				109	action="store_true", default=False,
				110	help="Cleanup resources related to benchmark"
				111	)
				112	ceph_bench_parser.add_argument(
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	113	'--report-only',
				114	action="store_true", default=False,
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	115	help="Just create report using files in '--dump-path' folder"
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	116	)
				117	ceph_bench_parser.add_argument(
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	118	'--dump-path',
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	119	metavar="dump_results",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	120	help="Dump result after each test run to use them later. "
				121	"Default: '/tmp'"
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	122	)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	123	ceph_bench_parser.add_argument(
				124	'--name',
				125	metavar="name", default="cephbench",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	126	help="Job name to use for running fio. "
				127	"Can be used to grep results. Default: 'cephbench'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	128	)
				129	ceph_bench_parser.add_argument(
				130	'--bs',
				131	metavar="blocksize", default="16k",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	132	help="Block size for single run. Default: '16k'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	133	)
				134	ceph_bench_parser.add_argument(
				135	'--iodepth',
				136	metavar="iodepth", default="16",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	137	help="IO Depth for single run. Default: '16'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	138	)
				139	ceph_bench_parser.add_argument(
				140	'--size',
				141	metavar="size", default="10G",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	142	help="Persistent volume size (M, G). Default: '10G'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	143	)
				144	ceph_bench_parser.add_argument(
				145	'--readwrite',
				146	metavar="readwrite", default="randrw",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	147	help="Test mode for single run (read, write, randrw, "
				148	"randread, randwrite). Default: 'randrw'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	149	)
				150	ceph_bench_parser.add_argument(
				151	'--rwmixread',
				152	metavar="rwmixread", default="50",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	153	help="Percent of read in random mixed mode (randrw). Default: '50'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	154	)
				155	ceph_bench_parser.add_argument(
				156	'--ramp-time',
				157	metavar="ramp_time", default="5s",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	158	help="Warmup time before test. Default: '5s'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	159	)
				160	ceph_bench_parser.add_argument(
				161	'--runtime',
				162	metavar="runtime", default="60s",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	163	help="How long to run test. Default: '60s'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	164	)
				165	ceph_bench_parser.add_argument(
				166	'--ioengine',
				167	metavar="ioengine", default="libaio",
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	168	help="IO Engine used by fio. See 'fio eng-help' output for list. "
				169	"Default: 'libaio'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	170	)
				171	ceph_bench_parser.add_argument(
				172	'--offset-increment',
				173	metavar="offset_increment", default="500M",
Alex	41dd0cc	2022-02-09 17:33:23 -0600	[diff] [blame]	174	help="Offset to be used in 'read' and 'write' modes if multiple jobs "
				175	"used"
Alex	30a0064	2021-12-30 14:20:48 -0600	[diff] [blame]	176	"Default: '500M'"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	177	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	178
				179	return _parser
				180
				181
				182	def do_info(args, config):
				183	# Ceph info
				184	# Gather ceph info and create an archive with data
				185	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	186
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	187	# check client and project names
				188	if not args.client_name or not args.project_name:
				189	logger_cli.error(
				190	"ERROR: Missing '--client-name' or '--project-name' options"
				191	)
				192	return
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	193	# _class = _selectClass(_env)
				194	ceph_info = info.KubeCephInfo(config)
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	195	_tgzfilename = ceph_info.get_info_archive_filename(
				196	args.client_name,
				197	args.project_name
				198	)
				199	logger_cli.info("# Archive will be generated to '{}'".format(_tgzfilename))
				200	# get html
				201	_htmlfilename = args_utils.get_arg(args, 'html')
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	202
				203	logger_cli.info("# Collecting Ceph cluster information")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	204	ceph_info.gather_info()
Alex	41dd0cc	2022-02-09 17:33:23 -0600	[diff] [blame]	205	ceph_info.gather_osd_configs()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	206
				207	# Debug, enable if needed to debug report generation
				208	# without actuall data collecting each time
				209	# ceph_info.dump_info()
				210	# ceph_info.load_info()
				211	# end debug
				212
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	213	ceph_info.get_transposed_latency_table()
				214	ceph_info.get_latest_health_readout()
Alex	eb934de	2022-10-06 13:49:30 -0500	[diff] [blame]	215	ceph_info.create_html_report(_htmlfilename)
				216
				217	# handle cli part
				218	ceph_info.generate_archive(_tgzfilename)
				219	ceph_info.print_summary()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	220
				221	return
				222
				223
				224	def do_bench(args, config):
				225	# Ceph Benchmark using multiple pods
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	226	# if only cleanup needed do it and exit
				227	_cleanup_only = args_utils.get_arg(args, 'cleanup_only')
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	228	_report_only = args_utils.get_arg(args, 'report_only')
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	229	config.resource_prefix = "cfgagent"
				230	if _cleanup_only:
				231	# Do forced resource cleanup and exit
				232	config.bench_mode = "cleanup"
				233	config.bench_agent_count = -1
				234	ceph_bench = bench.KubeCephBench(config)
				235	logger_cli.info(
				236	"# Discovering benchmark resources using prefix of '{}'".format(
				237	config.resource_prefix
				238	)
				239	)
				240	ceph_bench.prepare_cleanup()
				241	ceph_bench.cleanup()
				242	return
				243
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	244	# dump results options
				245	_dump_path = args_utils.get_arg(args, "dump_path")
				246	if _dump_path:
				247	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				248	config.bench_results_dump_path = _dump_path
				249	else:
				250	_p = "/tmp"
				251	logger_cli.info(
				252	"# No result dump path set. Defaulting to {}"
				253	"Consider setting it if running long task_file "
				254	"based test runs".format(_p)
				255	)
				256	config.bench_results_dump_path = _p
				257
				258	# Report filename
				259	_filename = args_utils.get_arg(args, 'html')
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	260	# gather Ceph info
				261	logger_cli.info("# Collecting Ceph cluster information")
				262	ceph_info = info.KubeCephInfo(config)
				263
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	264	# Task files or options
				265	_opts = get_fio_options()
				266	# Load name and announce it
				267	config.bench_name = args_utils.get_arg(args, "name")
				268	_opts["name"] = config.bench_name
				269	logger_cli.info(
				270	"# Using '{}' as ceph bench jobs name".format(_opts["name"])
				271	)
				272
				273	if _report_only:
				274	# Do forced report creation and exit
				275	config.bench_mode = "report"
				276	config.bench_agent_count = -1
				277	ceph_bench = bench.KubeCephBench(config)
				278	ceph_bench.set_ceph_info_class(ceph_info)
				279	logger_cli.info(
				280	"# Preparing to generate report '{}'".format(
				281	config.resource_prefix
				282	)
				283	)
				284	# Preload previous results for this name
				285	ceph_bench.preload_results()
				286	# Gather ceph data
				287	ceph_bench.wait_ceph_cooldown()
				288	# Generate report
				289	ceph_bench.create_report(_filename)
				290	return
				291
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	292	# Prepare the tasks and do synced testrun or a single one
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	293	logger_cli.info("# Initializing ceph benchmark module")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	294	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	295	# agents count option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	296	config.bench_agent_count = args_utils.get_arg(args, "agents")
				297	logger_cli.info("-> using {} agents".format(config.bench_agent_count))
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	298	# Cleaning option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	299	config.no_cleaning_after_benchmark = args_utils.get_arg(args, "no_cleanup")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	300	# storage class
				301	_storage_class = args_utils.get_arg(args, "storage_class")
				302	logger_cli.info("-> using storage class of '{}'".format(_storage_class))
				303	config.bench_storage_class = _storage_class
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	304	if _dump_path:
				305	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				306	config.bench_results_dump_path = _dump_path
				307	else:
				308	logger_cli.info(
				309	"# No result dump path set. "
				310	"Consider setting it if running long task_file based test runs"
				311	)
				312	config.bench_results_dump_path = _dump_path
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame]	313
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	314	_task_file = args_utils.get_arg(args, "task_file", nofail=True)
				315	if not _task_file:
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	316	logger_cli.info("-> Running single benchmark run")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	317	config.bench_mode = "single"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	318	# Updating _opts from arguments
				319	_params = [
				320	"bs",
				321	"iodepth",
				322	"size",
				323	"readwrite",
				324	"ramp_time",
				325	"runtime",
				326	"ioengine"
				327	]
				328	for _p in _params:
				329	_opts[_p] = _get_param_and_log(args, _p)
				330	if _opts["readwrite"] in seq_modes:
				331	_p = "offset_increment"
				332	_opts[_p] = _get_param_and_log(args, _p)
				333	elif _opts["readwrite"] in mix_modes:
				334	_p = "rwmixread"
				335	_opts[_p] = _get_param_and_log(args, _p)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	336	else:
				337	logger_cli.info("-> running with tasks from '{}'".format(_task_file))
				338	config.bench_task_file = _task_file
				339	config.bench_mode = "tasks"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	340	logger_cli.debug("... default/selected options for fio:")
				341	for _k in _opts.keys():
				342	# TODO: Update options for single run
				343	logger_cli.debug(" {} = {}".format(_k, _opts[_k]))
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	344
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	345	# init the Bench class
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	346	ceph_bench = bench.KubeCephBench(config)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	347	ceph_bench.set_ceph_info_class(ceph_info)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	348	# Preload previous results for this name
				349	ceph_bench.preload_results()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	350	# Do the testrun
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	351	ceph_bench.prepare_agents(_opts)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	352	ceph_bench.wait_ceph_cooldown()
				353
				354	# DEBUG of report in progress
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	355	if not ceph_bench.run_benchmark(_opts):
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	356	# No cleaning and/or report if benchmark was not finished
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	357	logger_cli.info("# Abnormal benchmark run, no cleaning performed")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	358	return
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	359	# Remove after DEBUG
				360	# ceph_bench.collect_results(_opts)
				361	# END DEBUG
				362
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	363	# Cleaning
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	364	if not config.no_cleaning_after_benchmark:
				365	ceph_bench.cleanup()
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	366	else:
				367	logger_cli.info(
				368	"# '--no-cleaning' option set. Cleaning not conducted."
				369	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	370
				371	# Create report
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	372	ceph_bench.create_report(_filename)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	373
				374	return