Blame - cfg_checker/modules/ceph/__init__.py - mcp/cfg-checker

blob: eee01ce3a008b0a6c9e5cee3288067988fc3ef4d [file] [log] [blame]

Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	1	from cfg_checker.agent.fio_runner import get_fio_options
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	2	from cfg_checker.agent.fio_runner import seq_modes, mix_modes
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	3	from cfg_checker.common import logger_cli
				4	from cfg_checker.common.settings import ENV_TYPE_KUBE
				5	from cfg_checker.helpers import args_utils
				6	from cfg_checker.modules.ceph import info, bench
				7
# Help text describing this module
command_help = "Ceph Storage information and benchmarks"
# Environment types this module declares as supported
supported_envs = [ENV_TYPE_KUBE]
				10
				11
				12	# def _selectClass(_env, strClassHint="checker"):
				13	# _class = None
				14	# if _env == ENV_TYPE_SALT:
				15	# if strClassHint == "info":
				16	# _class = info.SaltCephInfo
				17	# elif strClassHint == "bench":
				18	# _class = bench.SaltCephInfo
				19	# elif _env == ENV_TYPE_KUBE:
				20	# if strClassHint == "info":
				21	# _class = info.KubeCephInfo
				22	# elif strClassHint == "bench":
				23	# _class = bench.KubeCephBench
				24	# if not _class:
				25	# raise CheckerException(
				26	# "Unknown hint for selecting Ceph handler Class: '{}'".format(
				27	# strClassHint
				28	# )
				29	# )
				30	# else:
				31	# return _class
				32
def _get_param_and_log(arg, param_str):
    """Fetch a single argument value and announce it in the CLI log.

    :param arg: parsed arguments object handed to ``args_utils.get_arg``
    :param param_str: name of the argument to read
    :return: whatever ``args_utils.get_arg`` returned for ``param_str``
    """
    value = args_utils.get_arg(arg, param_str)
    message = " {}={}".format(param_str, value)
    logger_cli.info(message)
    return value
				37
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	38
				39	def init_parser(_parser):
				40	# network subparser
				41	ceph_subparsers = _parser.add_subparsers(dest='type')
				42
				43	ceph_info_parser = ceph_subparsers.add_parser(
				44	'info',
				45	help="Gather Ceph Cluster information"
				46	)
				47
				48	ceph_info_parser.add_argument(
				49	'--detailed',
				50	action="store_true", default=False,
				51	help="Print additional details"
				52	)
				53
				54	ceph_info_parser.add_argument(
				55	'--tgz',
				56	metavar='ceph_tgz_filename',
				57	help="HTML filename to save report"
				58	)
				59
				60	ceph_report_parser = ceph_subparsers.add_parser(
				61	'report',
				62	help="Generate network check report"
				63	)
				64
				65	ceph_report_parser.add_argument(
				66	'--html',
				67	metavar='ceph_html_filename',
				68	help="HTML filename to save report"
				69	)
				70
				71	ceph_bench_parser = ceph_subparsers.add_parser(
				72	'bench',
				73	help="Run ceph benchmark"
				74	)
				75
				76	ceph_bench_parser.add_argument(
				77	'--task-list',
				78	metavar='ceph_tasks_filename',
				79	help="List file with data for Ceph bench testrun"
				80	)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	81	ceph_bench_parser.add_argument(
				82	'--agents',
				83	type=int, metavar='agent_count', default=5,
				84	help="List file with data for Ceph bench testrun"
				85	)
				86	ceph_bench_parser.add_argument(
				87	'--html',
				88	metavar='ceph_html_filename',
				89	help="HTML filename to save report"
				90	)
				91	ceph_bench_parser.add_argument(
				92	'--storage-class',
				93	metavar='storage_class',
				94	help="Storage class to be used in benchmark"
				95	)
				96	ceph_bench_parser.add_argument(
				97	'--task-file',
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	98	metavar='task_file',
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	99	help="Task file for benchmark"
				100	)
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	101	ceph_bench_parser.add_argument(
				102	'--no-cleanup',
				103	action="store_true", default=False,
				104	help="Do not cleanup services, agents, pvc, and pv"
				105	)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	106	ceph_bench_parser.add_argument(
				107	'--cleanup-only',
				108	action="store_true", default=False,
				109	help="Cleanup resources related to benchmark"
				110	)
				111	ceph_bench_parser.add_argument(
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame^]	112	'--report-only',
				113	action="store_true", default=False,
				114	help="Just create report using files in folder"
				115	)
				116	ceph_bench_parser.add_argument(
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	117	'--dump-path',
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame^]	118	metavar="dump_results",
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	119	help="Dump result after each test run to use them later"
				120	)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	121	ceph_bench_parser.add_argument(
				122	'--name',
				123	metavar="name", default="cephbench",
				124	help="Dump result after each test run to use them later"
				125	)
				126	ceph_bench_parser.add_argument(
				127	'--bs',
				128	metavar="blocksize", default="16k",
				129	help="Block size for single run"
				130	)
				131	ceph_bench_parser.add_argument(
				132	'--iodepth',
				133	metavar="iodepth", default="16",
				134	help="IO Depth for single run"
				135	)
				136	ceph_bench_parser.add_argument(
				137	'--size',
				138	metavar="size", default="10G",
				139	help="Persistent volume size (M, G)"
				140	)
				141	ceph_bench_parser.add_argument(
				142	'--readwrite',
				143	metavar="readwrite", default="randrw",
				144	help="Test mode for single run"
				145	)
				146	ceph_bench_parser.add_argument(
				147	'--rwmixread',
				148	metavar="rwmixread", default="50",
				149	help="Percent of read in randon mixed mode (randrw)"
				150	)
				151	ceph_bench_parser.add_argument(
				152	'--ramp-time',
				153	metavar="ramp_time", default="5s",
				154	help="Warmup time before test"
				155	)
				156	ceph_bench_parser.add_argument(
				157	'--runtime',
				158	metavar="runtime", default="60s",
				159	help="Time based test run longevity"
				160	)
				161	ceph_bench_parser.add_argument(
				162	'--ioengine',
				163	metavar="ioengine", default="libaio",
				164	help="IO Engine used by fio. See eng-help output in fio for list"
				165	)
				166	ceph_bench_parser.add_argument(
				167	'--offset-increment',
				168	metavar="offset_increment", default="500M",
				169	help="IO Engine used by fio. See eng-help output in fio for list"
				170	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	171
				172	return _parser
				173
				174
				175	def do_info(args, config):
				176	# Ceph info
				177	# Gather ceph info and create an archive with data
				178	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
				179	# check tgz
				180	_tgzfile = "ceph_info_archive.tgz" if not args.tgz else args.tgz
				181
				182	# _class = _selectClass(_env)
				183	ceph_info = info.KubeCephInfo(config)
				184
				185	logger_cli.info("# Collecting Ceph cluster information")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	186	ceph_info.gather_info()
				187
				188	# Debug, enable if needed to debug report generation
				189	# without actuall data collecting each time
				190	# ceph_info.dump_info()
				191	# ceph_info.load_info()
				192	# end debug
				193
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	194	ceph_info.generate_archive(_tgzfile)
Alex	df9cc3a	2021-10-12 14:37:28 -0500	[diff] [blame]	195	ceph_info.print_summary()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	196
				197	return
				198
				199
				200	def do_report(args, config):
				201	# Ceph Report
				202	# Gather ceph info and create HTML report with all of the data
				203	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
				204	_filename = args_utils.get_arg(args, 'html')
				205	logger_cli.info("# Ceph cluster Configuration report")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	206
				207	# _class = _selectClass(_env)
				208	ceph_info = info.KubeCephInfo(config)
				209	# Debug, enable if needed to debug report generation
				210	# without actuall data collecting each time
				211	# ceph_info.load_info()
				212	# end debug
				213	ceph_info.gather_info()
				214	ceph_info.get_transposed_latency_table()
				215	ceph_info.get_latest_health_readout()
				216	ceph_info.create_html_report(_filename)
				217
				218	return
				219
				220
				221	def do_bench(args, config):
				222	# Ceph Benchmark using multiple pods
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	223	# if only cleanup needed do it and exit
				224	_cleanup_only = args_utils.get_arg(args, 'cleanup_only')
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame^]	225	_report_only = args_utils.get_arg(args, 'report_only')
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	226	config.resource_prefix = "cfgagent"
				227	if _cleanup_only:
				228	# Do forced resource cleanup and exit
				229	config.bench_mode = "cleanup"
				230	config.bench_agent_count = -1
				231	ceph_bench = bench.KubeCephBench(config)
				232	logger_cli.info(
				233	"# Discovering benchmark resources using prefix of '{}'".format(
				234	config.resource_prefix
				235	)
				236	)
				237	ceph_bench.prepare_cleanup()
				238	ceph_bench.cleanup()
				239	return
				240
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame^]	241	# dump results options
				242	_dump_path = args_utils.get_arg(args, "dump_path")
				243	if _dump_path:
				244	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				245	config.bench_results_dump_path = _dump_path
				246	else:
				247	_p = "/tmp"
				248	logger_cli.info(
				249	"# No result dump path set. Defaulting to {}"
				250	"Consider setting it if running long task_file "
				251	"based test runs".format(_p)
				252	)
				253	config.bench_results_dump_path = _p
				254
				255	# Report filename
				256	_filename = args_utils.get_arg(args, 'html')
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	257	# gather Ceph info
				258	logger_cli.info("# Collecting Ceph cluster information")
				259	ceph_info = info.KubeCephInfo(config)
				260
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame^]	261	# Task files or options
				262	_opts = get_fio_options()
				263	# Load name and announce it
				264	config.bench_name = args_utils.get_arg(args, "name")
				265	_opts["name"] = config.bench_name
				266	logger_cli.info(
				267	"# Using '{}' as ceph bench jobs name".format(_opts["name"])
				268	)
				269
				270	if _report_only:
				271	# Do forced report creation and exit
				272	config.bench_mode = "report"
				273	config.bench_agent_count = -1
				274	ceph_bench = bench.KubeCephBench(config)
				275	ceph_bench.set_ceph_info_class(ceph_info)
				276	logger_cli.info(
				277	"# Preparing to generate report '{}'".format(
				278	config.resource_prefix
				279	)
				280	)
				281	# Preload previous results for this name
				282	ceph_bench.preload_results()
				283	# Gather ceph data
				284	ceph_bench.wait_ceph_cooldown()
				285	# Generate report
				286	ceph_bench.create_report(_filename)
				287	return
				288
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	289	# Prepare the tasks and do synced testrun or a single one
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	290	logger_cli.info("# Initializing ceph benchmark module")
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	291	args_utils.check_supported_env(ENV_TYPE_KUBE, args, config)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	292	# agents count option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	293	config.bench_agent_count = args_utils.get_arg(args, "agents")
				294	logger_cli.info("-> using {} agents".format(config.bench_agent_count))
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	295	# Cleaning option
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	296	config.no_cleaning_after_benchmark = args_utils.get_arg(args, "no_cleanup")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	297	# storage class
				298	_storage_class = args_utils.get_arg(args, "storage_class")
				299	logger_cli.info("-> using storage class of '{}'".format(_storage_class))
				300	config.bench_storage_class = _storage_class
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	301	if _dump_path:
				302	logger_cli.info("# Results will be dumped to '{}'".format(_dump_path))
				303	config.bench_results_dump_path = _dump_path
				304	else:
				305	logger_cli.info(
				306	"# No result dump path set. "
				307	"Consider setting it if running long task_file based test runs"
				308	)
				309	config.bench_results_dump_path = _dump_path
Alex	30380a4	2021-12-20 16:11:20 -0600	[diff] [blame^]	310
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	311	_task_file = args_utils.get_arg(args, "task_file", nofail=True)
				312	if not _task_file:
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	313	logger_cli.info("-> Running single benchmark run")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	314	config.bench_mode = "single"
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	315	# Updating _opts from arguments
				316	_params = [
				317	"bs",
				318	"iodepth",
				319	"size",
				320	"readwrite",
				321	"ramp_time",
				322	"runtime",
				323	"ioengine"
				324	]
				325	for _p in _params:
				326	_opts[_p] = _get_param_and_log(args, _p)
				327	if _opts["readwrite"] in seq_modes:
				328	_p = "offset_increment"
				329	_opts[_p] = _get_param_and_log(args, _p)
				330	elif _opts["readwrite"] in mix_modes:
				331	_p = "rwmixread"
				332	_opts[_p] = _get_param_and_log(args, _p)
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	333	else:
				334	logger_cli.info("-> running with tasks from '{}'".format(_task_file))
				335	config.bench_task_file = _task_file
				336	config.bench_mode = "tasks"
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	337	logger_cli.debug("... default/selected options for fio:")
				338	for _k in _opts.keys():
				339	# TODO: Update options for single run
				340	logger_cli.debug(" {} = {}".format(_k, _opts[_k]))
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	341
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	342	# init the Bench class
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	343	ceph_bench = bench.KubeCephBench(config)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	344	ceph_bench.set_ceph_info_class(ceph_info)
Alex	90ac153	2021-12-09 11:13:14 -0600	[diff] [blame]	345	# Preload previous results for this name
				346	ceph_bench.preload_results()
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	347	# Do the testrun
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	348	ceph_bench.prepare_agents(_opts)
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	349	ceph_bench.wait_ceph_cooldown()
				350
				351	# DEBUG of report in progress
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	352	if not ceph_bench.run_benchmark(_opts):
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	353	# No cleaning and/or report if benchmark was not finished
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	354	logger_cli.info("# Abnormal benchmark run, no cleaning performed")
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	355	return
Alex	b212954	2021-11-23 15:49:42 -0600	[diff] [blame]	356	# Remove after DEBUG
				357	# ceph_bench.collect_results(_opts)
				358	# END DEBUG
				359
Alex	3034ba5	2021-11-13 17:06:45 -0600	[diff] [blame]	360	# Cleaning
Alex	2a7657c	2021-11-10 20:51:34 -0600	[diff] [blame]	361	if not config.no_cleaning_after_benchmark:
				362	ceph_bench.cleanup()
Alex	bfa947c	2021-11-11 18:14:28 -0600	[diff] [blame]	363	else:
				364	logger_cli.info(
				365	"# '--no-cleaning' option set. Cleaning not conducted."
				366	)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	367
				368	# Create report
Alex	5cace3b	2021-11-10 16:40:37 -0600	[diff] [blame]	369	ceph_bench.create_report(_filename)
Alex	dcb792f	2021-10-04 14:24:21 -0500	[diff] [blame]	370
				371	return