wally/run_test.py - mcp/cvp-wally - Gitiles

 import time
 import json
 import copy
 import logging
 from typing import List, Tuple, Optional, Union, cast

 from cephlib.wally_storage import WallyDB
 from cephlib.node import NodeInfo, IRPCNode, get_hw_info, get_sw_info, get_hostname
 from cephlib.ssh import parse_ssh_uri
 from cephlib.node_impl import setup_rpc, connect

 from . import utils
 from .config import ConfigBlock
 from .stage import Stage, StepOrder
 from .sensors import collect_sensors_data
 from .suits.all_suits import all_suits
 from .test_run_class import TestRun
 from .result_classes import SuiteConfig


 logger = logging.getLogger("wally")


 class ConnectStage(Stage):
     """Connect to nodes stage"""

     priority = StepOrder.CONNECT

     def run(self, ctx: TestRun) -> None:
         with ctx.get_pool() as pool:
             logger.info("Connecting to %s nodes", len(ctx.nodes_info))

             def connect_ext(node_info: NodeInfo) -> Tuple[bool, Union[IRPCNode, NodeInfo]]:
                 try:
                     ssh_node = connect(node_info, conn_timeout=ctx.config.connect_timeout)

                     return True, setup_rpc(ssh_node,
                                            ctx.rpc_code,
                                            ctx.default_rpc_plugins,
                                            log_level=ctx.config.rpc_log_level,
                                            sudo=True)
                 except Exception as exc:
                     logger.exception("During connect to %s: %s", node_info, exc)
                     return False, node_info

             failed_testnodes = []  # type: List[NodeInfo]
             failed_nodes = []  # type: List[NodeInfo]
             ctx.nodes = []

             for ok, node in pool.map(connect_ext, ctx.nodes_info.values()):
                 if not ok:
                     node = cast(NodeInfo, node)
                     if 'testnode' in node.roles:
                         failed_testnodes.append(node)
                     else:
                         failed_nodes.append(node)
                 else:
                     ctx.nodes.append(cast(IRPCNode, node))

             if failed_nodes:
                 msg = "Node(s) {} would be excluded - can't connect"
                 logger.warning(msg.format(", ".join(map(str, failed_nodes))))

             if failed_testnodes:
                 msg = "Can't start RPC on testnode(s) " + ",".join(map(str, failed_testnodes))
                 logger.error(msg)
                 raise utils.StopTestError(msg)

             if not failed_nodes:
                 logger.info("All nodes connected successfully")

             def get_time(node):
                 return node.conn.sys.time()

             t_start = time.time()
             tms = pool.map(get_time, ctx.nodes)
             t_end = time.time()

             for node, val in zip(ctx.nodes, tms):
                 delta = 0
                 if val > t_end:
                     delta = val - t_end
                 elif val < t_start:
                     delta = t_start - val

                 if delta > ctx.config.max_time_diff_ms:
                     msg = ("Too large time shift {}ms on node {}. Stopping test." +
                            " Fix time on cluster nodes and restart test, or change " +
                            "max_time_diff_ms(={}ms) setting in config").format(delta,
                                                                                str(node),
                                                                                ctx.config.max_time_diff_ms)
                     logger.error(msg)
                     raise utils.StopTestError(msg)

                 if delta > 1:
                     logger.warning("Node %s has time shift at least %s ms", node, int(delta))

     def cleanup(self, ctx: TestRun) -> None:
         if ctx.config.get("download_rpc_logs", False):
             logger.info("Killing all outstanding processes")
             for node in ctx.nodes:
                 node.conn.cli.killall()

             if ctx.ceph_master_node:
                 ctx.ceph_master_node.conn.cli.killall()

             logger.info("Downloading RPC servers logs")
             for node in ctx.nodes:
                 if node.rpc_log_file is not None:
                     nid = node.node_id
                     path = WallyDB.rpc_logs.format(node_id=nid)
                     node.conn.server.flush_logs()
                     log = node.get_file_content(node.rpc_log_file)
                     if path in ctx.storage:
                         ctx.storage.append_raw(log, path)
                     else:
                         ctx.storage.put_raw(log, path)
                     logger.debug("RPC log from node {} stored into storage::{}".format(nid, path))

         logger.info("Disconnecting")
         with ctx.get_pool() as pool:
             list(pool.map(lambda node: node.disconnect(stop=True),
                           ctx.nodes + ([ctx.ceph_master_node] if ctx.ceph_master_node else [])))


 class CollectInfoStage(Stage):
     """Collect node info"""

     priority = StepOrder.UPDATE_NODES_INFO
     config_block = 'collect_info'

     def run(self, ctx: TestRun) -> None:
         with ctx.get_pool() as pool:
             try:
                 # can't make next RPC request until finish with previous for same node
                 for node, hw_info in zip(ctx.nodes, pool.map(get_hw_info, ctx.nodes)):
                     node.info.hw_info = hw_info
                 for node, sw_info in zip(ctx.nodes, pool.map(get_sw_info, ctx.nodes)):
                     node.info.sw_info = sw_info
             except Exception as exc:
                 logger.exception("During collecting cluster info")
                 raise utils.StopTestError() from exc

             logger.debug("Collecting hostnames")
             hostnames = pool.map(get_hostname, ctx.nodes)
             for node, hostname in zip(ctx.nodes, hostnames):
                 node.info.hostname = hostname


 class ExplicitNodesStage(Stage):
     """add explicit nodes"""

     priority = StepOrder.DISCOVER
     config_block = 'nodes'

     def run(self, ctx: TestRun) -> None:
         if WallyDB.all_nodes in ctx.storage:
             logger.info("Skip explicid nodes filling, as all_nodes all ready in storage")
             return

         for url, roles in ctx.config.get('nodes', {}).raw().items():
             ctx.merge_node(parse_ssh_uri(url), set(role.strip() for role in roles.split(",")))
             logger.debug("Add node %s with roles %s", url, roles)


 class SleepStage(Stage):
     """Save nodes list to file"""

     priority = StepOrder.TEST
     config_block = 'sleep'

     def run(self, ctx: TestRun) -> None:
         logger.debug("Will sleep for %r seconds", ctx.config.sleep)
         stime = time.time()
         time.sleep(ctx.config.sleep)
         ctx.storage.put([int(stime), int(time.time())], 'idle')


 class PrepareNodes(Stage):
     priority = StepOrder.START_SENSORS - 1

     def __init__(self):
         Stage.__init__(self)
         self.nodeepscrub_updated = False
         self.noscrub_updated = False

     def run(self, ctx: TestRun) -> None:
         ceph_sett = ctx.config.get('ceph_settings', "").split()
         if ceph_sett:
             for node in ctx.nodes:
                 if "ceph-mon" in node.info.roles or "ceph-osd" in node.info.roles:
                     state = json.loads(node.run("ceph health --format json"))["summary"]["summary"]
                     if 'noscrub' in ceph_sett:
                         if 'noscrub' in state:
                             logger.debug("noscrub already set on cluster")
                         else:
                             logger.info("Applying noscrub settings to ceph cluster")
                             node.run("ceph osd set noscrub")
                             self.noscrub_updated = True

                     if 'nodeepscrub' in ceph_sett:
                         if 'nodeepscrub' in state:
                             logger.debug("noscrub already set on cluster")
                         else:
                             logger.info("Applying noscrub settings to ceph cluster")
                             node.run("ceph osd set noscrub")
                             self.nodeepscrub_updated = True
                     break

     def cleanup(self, ctx: TestRun) -> None:
         if self.nodeepscrub_updated or self.noscrub_updated:
             for node in ctx.nodes:
                 if "ceph-mon" in node.info.roles or "ceph-osd" in node.info.roles :
                     if self.noscrub_updated:
                         logger.info("Reverting noscrub setting for ceph cluster")
                         node.run("ceph osd unset noscrub")
                         self.noscrub_updated = False

                     if self.nodeepscrub_updated:
                         logger.info("Reverting noscrub setting for ceph cluster")
                         node.run("ceph osd unset nodeepscrub")
                         self.nodeepscrub_updated = False


 class RunTestsStage(Stage):

     priority = StepOrder.TEST
     config_block = 'tests'

     def run(self, ctx: TestRun) -> None:
         if ctx.config.no_tests:
             logger.info("Skiping tests, as 'no_tests' config settings is True")
             return

         for suite_idx, test_suite in enumerate(ctx.config.get('tests', [])):
             test_nodes = [node for node in ctx.nodes if 'testnode' in node.info.roles]

             if not test_nodes:
                 logger.error("No test nodes found")
                 raise utils.StopTestError()

             if len(test_suite) != 1:
                 logger.error("Test suite %s contain more than one test. Put each test in separated group", suite_idx)
                 raise utils.StopTestError()

             name, params = list(test_suite.items())[0]
             vm_count = params.get('node_limit', None)  # type: Optional[int]

             # select test nodes
             if vm_count is None:
                 curr_test_nodes = test_nodes
             else:
                 curr_test_nodes = test_nodes[:vm_count]

             if not curr_test_nodes:
                 logger.error("No nodes found for test, skipping it.")
                 continue

             if name not in all_suits:
                 logger.error("Test suite %r not found. Only suits [%s] available", name, ", ".join(all_suits))
                 raise utils.StopTestError()

             test_cls = all_suits[name]
             remote_dir = ctx.config.default_test_local_folder.format(name=name, uuid=ctx.config.run_uuid)
             suite = SuiteConfig(test_cls.name,
                                 params=params,
                                 run_uuid=ctx.config.run_uuid,
                                 nodes=test_nodes,
                                 remote_dir=remote_dir,
                                 idx=suite_idx,
                                 keep_raw_files=ctx.config.keep_raw_files)

             test_cls(storage=ctx.rstorage,
                      suite=suite,
                      on_tests_boundry=lambda before_test: collect_sensors_data(ctx, False, before_test)).run()

     @classmethod
     def validate_config(cls, cfg: ConfigBlock) -> None:
         pass


 class SaveNodesStage(Stage):
     """Save nodes list to file"""
     priority = StepOrder.UPDATE_NODES_INFO + 1

     def run(self, ctx: TestRun) -> None:
         infos = list(ctx.nodes_info.values())
         params = {node.node_id: node.params for node in infos}
         ninfos = [copy.copy(node) for node in infos]
         for node in ninfos:
             node.params = {"in file": WallyDB.nodes_params}
         ctx.storage.put_list(ninfos, WallyDB.all_nodes)
         ctx.storage.put_raw(json.dumps(params).encode('utf8'), WallyDB.nodes_params)


 class LoadStoredNodesStage(Stage):
     priority = StepOrder.DISCOVER

     def run(self, ctx: TestRun) -> None:
         if WallyDB.all_nodes in ctx.storage:
             if ctx.nodes_info:
                 logger.error("Internal error: Some nodes already stored in " +
                              "nodes_info before LoadStoredNodesStage stage")
                 raise utils.StopTestError()

             ctx.nodes_info = {node.node_id: node for node in ctx.rstorage.load_nodes()}
             logger.info("%s nodes loaded from database", len(ctx.nodes_info))
	import time
	import json
	import copy
	import logging
	from typing import List, Tuple, Optional, Union, cast

	from cephlib.wally_storage import WallyDB
	from cephlib.node import NodeInfo, IRPCNode, get_hw_info, get_sw_info, get_hostname
	from cephlib.ssh import parse_ssh_uri
	from cephlib.node_impl import setup_rpc, connect

	from . import utils
	from .config import ConfigBlock
	from .stage import Stage, StepOrder
	from .sensors import collect_sensors_data
	from .suits.all_suits import all_suits
	from .test_run_class import TestRun
	from .result_classes import SuiteConfig


	logger = logging.getLogger("wally")


	class ConnectStage(Stage):
	"""Connect to nodes stage"""

	priority = StepOrder.CONNECT

	def run(self, ctx: TestRun) -> None:
	with ctx.get_pool() as pool:
	logger.info("Connecting to %s nodes", len(ctx.nodes_info))

	def connect_ext(node_info: NodeInfo) -> Tuple[bool, Union[IRPCNode, NodeInfo]]:
	try:
	ssh_node = connect(node_info, conn_timeout=ctx.config.connect_timeout)

	return True, setup_rpc(ssh_node,
	ctx.rpc_code,
	ctx.default_rpc_plugins,
	log_level=ctx.config.rpc_log_level,
	sudo=True)
	except Exception as exc:
	logger.exception("During connect to %s: %s", node_info, exc)
	return False, node_info

	failed_testnodes = [] # type: List[NodeInfo]
	failed_nodes = [] # type: List[NodeInfo]
	ctx.nodes = []

	for ok, node in pool.map(connect_ext, ctx.nodes_info.values()):
	if not ok:
	node = cast(NodeInfo, node)
	if 'testnode' in node.roles:
	failed_testnodes.append(node)
	else:
	failed_nodes.append(node)
	else:
	ctx.nodes.append(cast(IRPCNode, node))

	if failed_nodes:
	msg = "Node(s) {} would be excluded - can't connect"
	logger.warning(msg.format(", ".join(map(str, failed_nodes))))

	if failed_testnodes:
	msg = "Can't start RPC on testnode(s) " + ",".join(map(str, failed_testnodes))
	logger.error(msg)
	raise utils.StopTestError(msg)

	if not failed_nodes:
	logger.info("All nodes connected successfully")

	def get_time(node):
	return node.conn.sys.time()

	t_start = time.time()
	tms = pool.map(get_time, ctx.nodes)
	t_end = time.time()

	for node, val in zip(ctx.nodes, tms):
	delta = 0
	if val > t_end:
	delta = val - t_end
	elif val < t_start:
	delta = t_start - val

	if delta > ctx.config.max_time_diff_ms:
	msg = ("Too large time shift {}ms on node {}. Stopping test." +
	" Fix time on cluster nodes and restart test, or change " +
	"max_time_diff_ms(={}ms) setting in config").format(delta,
	str(node),
	ctx.config.max_time_diff_ms)
	logger.error(msg)
	raise utils.StopTestError(msg)

	if delta > 1:
	logger.warning("Node %s has time shift at least %s ms", node, int(delta))

	def cleanup(self, ctx: TestRun) -> None:
	if ctx.config.get("download_rpc_logs", False):
	logger.info("Killing all outstanding processes")
	for node in ctx.nodes:
	node.conn.cli.killall()

	if ctx.ceph_master_node:
	ctx.ceph_master_node.conn.cli.killall()

	logger.info("Downloading RPC servers logs")
	for node in ctx.nodes:
	if node.rpc_log_file is not None:
	nid = node.node_id
	path = WallyDB.rpc_logs.format(node_id=nid)
	node.conn.server.flush_logs()
	log = node.get_file_content(node.rpc_log_file)
	if path in ctx.storage:
	ctx.storage.append_raw(log, path)
	else:
	ctx.storage.put_raw(log, path)
	logger.debug("RPC log from node {} stored into storage::{}".format(nid, path))

	logger.info("Disconnecting")
	with ctx.get_pool() as pool:
	list(pool.map(lambda node: node.disconnect(stop=True),
	ctx.nodes + ([ctx.ceph_master_node] if ctx.ceph_master_node else [])))


	class CollectInfoStage(Stage):
	"""Collect node info"""

	priority = StepOrder.UPDATE_NODES_INFO
	config_block = 'collect_info'

	def run(self, ctx: TestRun) -> None:
	with ctx.get_pool() as pool:
	try:
	# can't make next RPC request until finish with previous for same node
	for node, hw_info in zip(ctx.nodes, pool.map(get_hw_info, ctx.nodes)):
	node.info.hw_info = hw_info
	for node, sw_info in zip(ctx.nodes, pool.map(get_sw_info, ctx.nodes)):
	node.info.sw_info = sw_info
	except Exception as exc:
	logger.exception("During collecting cluster info")
	raise utils.StopTestError() from exc

	logger.debug("Collecting hostnames")
	hostnames = pool.map(get_hostname, ctx.nodes)
	for node, hostname in zip(ctx.nodes, hostnames):
	node.info.hostname = hostname


	class ExplicitNodesStage(Stage):
	"""add explicit nodes"""

	priority = StepOrder.DISCOVER
	config_block = 'nodes'

	def run(self, ctx: TestRun) -> None:
	if WallyDB.all_nodes in ctx.storage:
	logger.info("Skip explicid nodes filling, as all_nodes all ready in storage")
	return

	for url, roles in ctx.config.get('nodes', {}).raw().items():
	ctx.merge_node(parse_ssh_uri(url), set(role.strip() for role in roles.split(",")))
	logger.debug("Add node %s with roles %s", url, roles)


	class SleepStage(Stage):
	"""Save nodes list to file"""

	priority = StepOrder.TEST
	config_block = 'sleep'

	def run(self, ctx: TestRun) -> None:
	logger.debug("Will sleep for %r seconds", ctx.config.sleep)
	stime = time.time()
	time.sleep(ctx.config.sleep)
	ctx.storage.put([int(stime), int(time.time())], 'idle')


	class PrepareNodes(Stage):
	priority = StepOrder.START_SENSORS - 1

	def __init__(self):
	Stage.__init__(self)
	self.nodeepscrub_updated = False
	self.noscrub_updated = False

	def run(self, ctx: TestRun) -> None:
	ceph_sett = ctx.config.get('ceph_settings', "").split()
	if ceph_sett:
	for node in ctx.nodes:
	if "ceph-mon" in node.info.roles or "ceph-osd" in node.info.roles:
	state = json.loads(node.run("ceph health --format json"))["summary"]["summary"]
	if 'noscrub' in ceph_sett:
	if 'noscrub' in state:
	logger.debug("noscrub already set on cluster")
	else:
	logger.info("Applying noscrub settings to ceph cluster")
	node.run("ceph osd set noscrub")
	self.noscrub_updated = True

	if 'nodeepscrub' in ceph_sett:
	if 'nodeepscrub' in state:
	logger.debug("noscrub already set on cluster")
	else:
	logger.info("Applying noscrub settings to ceph cluster")
	node.run("ceph osd set noscrub")
	self.nodeepscrub_updated = True
	break

	def cleanup(self, ctx: TestRun) -> None:
	if self.nodeepscrub_updated or self.noscrub_updated:
	for node in ctx.nodes:
	if "ceph-mon" in node.info.roles or "ceph-osd" in node.info.roles :
	if self.noscrub_updated:
	logger.info("Reverting noscrub setting for ceph cluster")
	node.run("ceph osd unset noscrub")
	self.noscrub_updated = False

	if self.nodeepscrub_updated:
	logger.info("Reverting noscrub setting for ceph cluster")
	node.run("ceph osd unset nodeepscrub")
	self.nodeepscrub_updated = False


	class RunTestsStage(Stage):

	priority = StepOrder.TEST
	config_block = 'tests'

	def run(self, ctx: TestRun) -> None:
	if ctx.config.no_tests:
	logger.info("Skiping tests, as 'no_tests' config settings is True")
	return

	for suite_idx, test_suite in enumerate(ctx.config.get('tests', [])):
	test_nodes = [node for node in ctx.nodes if 'testnode' in node.info.roles]

	if not test_nodes:
	logger.error("No test nodes found")
	raise utils.StopTestError()

	if len(test_suite) != 1:
	logger.error("Test suite %s contain more than one test. Put each test in separated group", suite_idx)
	raise utils.StopTestError()

	name, params = list(test_suite.items())[0]
	vm_count = params.get('node_limit', None) # type: Optional[int]

	# select test nodes
	if vm_count is None:
	curr_test_nodes = test_nodes
	else:
	curr_test_nodes = test_nodes[:vm_count]

	if not curr_test_nodes:
	logger.error("No nodes found for test, skipping it.")
	continue

	if name not in all_suits:
	logger.error("Test suite %r not found. Only suits [%s] available", name, ", ".join(all_suits))
	raise utils.StopTestError()

	test_cls = all_suits[name]
	remote_dir = ctx.config.default_test_local_folder.format(name=name, uuid=ctx.config.run_uuid)
	suite = SuiteConfig(test_cls.name,
	params=params,
	run_uuid=ctx.config.run_uuid,
	nodes=test_nodes,
	remote_dir=remote_dir,
	idx=suite_idx,
	keep_raw_files=ctx.config.keep_raw_files)

	test_cls(storage=ctx.rstorage,
	suite=suite,
	on_tests_boundry=lambda before_test: collect_sensors_data(ctx, False, before_test)).run()

	@classmethod
	def validate_config(cls, cfg: ConfigBlock) -> None:
	pass


	class SaveNodesStage(Stage):
	"""Save nodes list to file"""
	priority = StepOrder.UPDATE_NODES_INFO + 1

	def run(self, ctx: TestRun) -> None:
	infos = list(ctx.nodes_info.values())
	params = {node.node_id: node.params for node in infos}
	ninfos = [copy.copy(node) for node in infos]
	for node in ninfos:
	node.params = {"in file": WallyDB.nodes_params}
	ctx.storage.put_list(ninfos, WallyDB.all_nodes)
	ctx.storage.put_raw(json.dumps(params).encode('utf8'), WallyDB.nodes_params)


	class LoadStoredNodesStage(Stage):
	priority = StepOrder.DISCOVER

	def run(self, ctx: TestRun) -> None:
	if WallyDB.all_nodes in ctx.storage:
	if ctx.nodes_info:
	logger.error("Internal error: Some nodes already stored in " +
	"nodes_info before LoadStoredNodesStage stage")
	raise utils.StopTestError()

	ctx.nodes_info = {node.node_id: node for node in ctx.rstorage.load_nodes()}
	logger.info("%s nodes loaded from database", len(ctx.nodes_info))