Blame - cfg_checker/modules/ceph/__init__.py - mcp/cfg-checker

blob: dd483cf4ad812085593631d7c1ed61bbf00d2baf [file] [log] [blame]

Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	1	from cfg_checker.agent.fio_runner import get_fio_options
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	2	from cfg_checker.agent.fio_runner import seq_modes, mix_modes
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	3	from cfg_checker.common import logger_cli
				4	from cfg_checker.common.settings import ENV_TYPE_KUBE
				5	from cfg_checker.helpers import args_utils
				6	from cfg_checker.modules.ceph import info, bench
				7
# Short description of this module's command
# (presumably shown in the CLI help output -- confirm against the caller)
command_help = "Ceph Storage information and benchmarks"
# Environment types this module declares as supported: Kube only
supported_envs = [ENV_TYPE_KUBE]
				10
				11
				12	# def _selectClass(_env, strClassHint="checker"):
				13	# _class = None
				14	# if _env == ENV_TYPE_SALT:
				15	# if strClassHint == "info":
				16	# _class = info.SaltCephInfo
				17	# elif strClassHint == "bench":
				18	# _class = bench.SaltCephInfo
				19	# elif _env == ENV_TYPE_KUBE:
				20	# if strClassHint == "info":
				21	# _class = info.KubeCephInfo
				22	# elif strClassHint == "bench":
				23	# _class = bench.KubeCephBench
				24	# if not _class:
				25	# raise CheckerException(
				26	# "Unknown hint for selecting Ceph handler Class: '{}'".format(
				27	# strClassHint
				28	# )
				29	# )
				30	# else:
				31	# return _class
				32
def _get_param_and_log(arg, param_str):
    """Fetch option `param_str` from `arg`, log it and hand it back.

    :param arg: parsed arguments object passed on to args_utils.get_arg
    :param param_str: name of the option to fetch
    :return: whatever args_utils.get_arg returned for this option
    """
    fetched = args_utils.get_arg(arg, param_str)
    # Log as 'name=value' so the effective options are visible in output
    message = " {}={}".format(param_str, fetched)
    logger_cli.info(message)
    return fetched
				37
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	38
				39	def init_parser(_parser):
				40	# network subparser
				41	ceph_subparsers = _parser.add_subparsers(dest='type')
				42
				43	ceph_info_parser = ceph_subparsers.add_parser(
				44	'info',
				45	help="Gather Ceph Cluster information"
				46	)
				47
				48	ceph_info_parser.add_argument(
				49	'--detailed',
				50	action="store_true", default=False,
				51	help="Print additional details"
				52	)
				53
				54	ceph_info_parser.add_argument(
				55	'--tgz',
				56	metavar='ceph_tgz_filename',
				57	help="HTML filename to save report"
				58	)
				59
				60	ceph_report_parser = ceph_subparsers.add_parser(
				61	'report',
				62	help="Generate network check report"
				63	)
				64
				65	ceph_report_parser.add_argument(
				66	'--html',
				67	metavar='ceph_html_filename',
				68	help="HTML filename to save report"
				69	)
				70
				71	ceph_bench_parser = ceph_subparsers.add_parser(
				72	'bench',
				73	help="Run ceph benchmark"
				74	)
				75
				76	ceph_bench_parser.add_argument(
				77	'--task-list',
				78	metavar='ceph_tasks_filename',
				79	help="List file with data for Ceph bench testrun"
				80	)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	81	ceph_bench_parser.add_argument(
				82	'--agents',
				83	type=int, metavar='agent_count', default=5,
				84	help="List file with data for Ceph bench testrun"
				85	)
				86	ceph_bench_parser.add_argument(
				87	'--html',
				88	metavar='ceph_html_filename',
				89	help="HTML filename to save report"
				90	)
				91	ceph_bench_parser.add_argument(
				92	'--storage-class',
				93	metavar='storage_class',
				94	help="Storage class to be used in benchmark"
				95	)
				96	ceph_bench_parser.add_argument(
				97	'--task-file',
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	98	metavar='task_file',
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	99	help="Task file for benchmark"
				100	)
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	101	ceph_bench_parser.add_argument(
				102	'--no-cleanup',
				103	action="store_true", default=False,
				104	help="Do not cleanup services, agents, pvc, and pv"
				105	)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	106	ceph_bench_parser.add_argument(
				107	'--cleanup-only',
				108	action="store_true", default=False,
				109	help="Cleanup resources related to benchmark"
				110	)
				111	ceph_bench_parser.add_argument(
				112	'--dump-path',
				113	metavar="dump_results", default="/tmp",
				114	help="Dump result after each test run to use them later"
				115	)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	116	ceph_bench_parser.add_argument(
				117	'--name',
				118	metavar="name", default="cephbench",
				119	help="Dump result after each test run to use them later"
				120	)
				121	ceph_bench_parser.add_argument(
				122	'--bs',
				123	metavar="blocksize", default="16k",
				124	help="Block size for single run"
				125	)
				126	ceph_bench_parser.add_argument(
				127	'--iodepth',
				128	metavar="iodepth", default="16",
				129	help="IO Depth for single run"
				130	)
				131	ceph_bench_parser.add_argument(
				132	'--size',
				133	metavar="size", default="10G",
				134	help="Persistent volume size (M, G)"
				135	)
				136	ceph_bench_parser.add_argument(
				137	'--readwrite',
				138	metavar="readwrite", default="randrw",
				139	help="Test mode for single run"
				140	)
				141	ceph_bench_parser.add_argument(
				142	'--rwmixread',
				143	metavar="rwmixread", default="50",
				144	help="Percent of read in randon mixed mode (randrw)"
				145	)
				146	ceph_bench_parser.add_argument(
				147	'--ramp-time',
				148	metavar="ramp_time", default="5s",
				149	help="Warmup time before test"
				150	)
				151	ceph_bench_parser.add_argument(
				152	'--runtime',
				153	metavar="runtime", default="60s",
				154	help="Time based test run longevity"
				155	)
				156	ceph_bench_parser.add_argument(
				157	'--ioengine',
				158	metavar="ioengine", default="libaio",
				159	help="IO Engine used by fio. See eng-help output in fio for list"
				160	)
				161	ceph_bench_parser.add_argument(
				162	'--offset-increment',
				163	metavar="offset_increment", default="500M",
				164	help="IO Engine used by fio. See eng-help output in fio for list"
				165	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	166
				167	return _parser
				168
				169
				170	def do_info(args, config):
				171	# Ceph info
				172	# Gather ceph info and create an archive with data
				173	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
				174	# check tgz
				175	_tgzfile = "ceph_info_archive.tgz" if not args.tgz else args.tgz
				176
				177	# _class = _selectClass(_env)
				178	ceph_info = info.KubeCephInfo(config)
				179
				180	logger_cli.info("# Collecting Ceph cluster information")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	181	ceph_info.gather_info()
				182
				183	# Debug, enable if needed to debug report generation
				184	# without actuall data collecting each time
				185	# ceph_info.dump_info()
				186	# ceph_info.load_info()
				187	# end debug
				188
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	189	ceph_info.generate_archive(_tgzfile)
Alex	df9cc3a	2021-10-12 14:37:28 -0500	[diff] [blame]	190	ceph_info.print_summary()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	191
				192	return
				193
				194
				195	def do_report(args, config):
				196	# Ceph Report
				197	# Gather ceph info and create HTML report with all of the data
				198	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
				199	_filename = args_utils.get_arg(args, 'html')
				200	logger_cli.info("# Ceph cluster Configuration report")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	201
				202	# _class = _selectClass(_env)
				203	ceph_info = info.KubeCephInfo(config)
				204	# Debug, enable if needed to debug report generation
				205	# without actuall data collecting each time
				206	# ceph_info.load_info()
				207	# end debug
				208	ceph_info.gather_info()
				209	ceph_info.get_transposed_latency_table()
				210	ceph_info.get_latest_health_readout()
				211	ceph_info.create_html_report(_filename)
				212
				213	return
				214
				215
				216	def do_bench(args, config):
				217	# Ceph Benchmark using multiple pods
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	218	# if only cleanup needed do it and exit
				219	_cleanup_only = args_utils.get_arg(args, 'cleanup_only')
				220	config.resource_prefix = "cfgagent"
				221	if _cleanup_only:
				222	# Do forced resource cleanup and exit
				223	config.bench_mode = "cleanup"
				224	config.bench_agent_count = -1
				225	ceph_bench = bench.KubeCephBench(config)
				226	logger_cli.info(
				227	"# Discovering benchmark resources using prefix of '{}'".format(
				228	config.resource_prefix
				229	)
				230	)
				231	ceph_bench.prepare_cleanup()
				232	ceph_bench.cleanup()
				233	return
				234
				235	# gather Ceph info
				236	logger_cli.info("# Collecting Ceph cluster information")
				237	ceph_info = info.KubeCephInfo(config)
				238
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	239	# Prepare the tasks and do synced testrun or a single one
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	240	logger_cli.info("# Initializing ceph benchmark module")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	241	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	242	# Report filename
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	243	_filename = args_utils.get_arg(args, 'html')
				244	# agents count option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	245	config.bench_agent_count = args_utils.get_arg(args, "agents")
				246	logger_cli.info("-> using {} agents".format(config.bench_agent_count))
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	247	# Cleaning option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	248	config.no_cleaning_after_benchmark = args_utils.get_arg(args, "no_cleanup")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	249	# storage class
				250	_storage_class = args_utils.get_arg(args, "storage_class")
				251	logger_cli.info("-> using storage class of '{}'".format(_storage_class))
				252	config.bench_storage_class = _storage_class
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	253	# dump results options
				254	_dump_path = args_utils.get_arg(args, "dump_path")
				255	if _dump_path:
				256	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				257	config.bench_results_dump_path = _dump_path
				258	else:
				259	logger_cli.info(
				260	"# No result dump path set. "
				261	"Consider setting it if running long task_file based test runs"
				262	)
				263	config.bench_results_dump_path = _dump_path
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	264	# Task files or options
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	265	_opts = get_fio_options()
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	266	_task_file = args_utils.get_arg(args, "task_file", nofail=True)
				267	if not _task_file:
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	268	logger_cli.info("-> Running single benchmark run")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	269	config.bench_mode = "single"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	270	# Updating _opts from arguments
				271	_params = [
				272	"bs",
				273	"iodepth",
				274	"size",
				275	"readwrite",
				276	"ramp_time",
				277	"runtime",
				278	"ioengine"
				279	]
				280	for _p in _params:
				281	_opts[_p] = _get_param_and_log(args, _p)
				282	if _opts["readwrite"] in seq_modes:
				283	_p = "offset_increment"
				284	_opts[_p] = _get_param_and_log(args, _p)
				285	elif _opts["readwrite"] in mix_modes:
				286	_p = "rwmixread"
				287	_opts[_p] = _get_param_and_log(args, _p)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	288	else:
				289	logger_cli.info("-> running with tasks from '{}'".format(_task_file))
				290	config.bench_task_file = _task_file
				291	config.bench_mode = "tasks"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	292	config.bench_name = args_utils.get_arg(args, "name")
				293	_opts["name"] = config.bench_name
				294	logger_cli.info(
				295	"# Using '{}' as ceph bench jobs name".format(_opts["name"])
				296	)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	297	logger_cli.debug("... default/selected options for fio:")
				298	for _k in _opts.keys():
				299	# TODO: Update options for single run
				300	logger_cli.debug(" {} = {}".format(_k, _opts[_k]))
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	301
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	302	# handle option inavailability from command line for single mode
				303
				304	# init the Bench class
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	305	ceph_bench = bench.KubeCephBench(config)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	306	ceph_bench.set_ceph_info_class(ceph_info)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame^]	307	# Preload previous results for this name
				308	ceph_bench.preload_results()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	309	# Do the testrun
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	310	ceph_bench.prepare_agents(_opts)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	311	ceph_bench.wait_ceph_cooldown()
				312
				313	# DEBUG of report in progress
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	314	if not ceph_bench.run_benchmark(_opts):
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	315	# No cleaning and/or report if benchmark was not finished
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	316	logger_cli.info("# Abnormal benchmark run, no cleaning performed")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	317	return
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	318	# Remove after DEBUG
				319	# ceph_bench.collect_results(_opts)
				320	# END DEBUG
				321
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	322	# Cleaning
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	323	if not config.no_cleaning_after_benchmark:
				324	ceph_bench.cleanup()
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	325	else:
				326	logger_cli.info(
				327	"# '--no-cleaning' option set. Cleaning not conducted."
				328	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	329
				330	# Create report
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	331	ceph_bench.create_report(_filename)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	332
				333	return