blob: a997b026a89a344d977501bc810292aec260067f [file] [log] [blame]
koder aka kdanilov4643fd62015-02-10 16:20:13 -08001import abc
koder aka kdanilovbc2c8982015-06-13 02:50:43 +03002import time
3import logging
koder aka kdanilov4643fd62015-02-10 16:20:13 -08004import os.path
kdanylov aka koder026e5f22017-05-15 01:04:39 +03005from typing import Any, List, Optional, Callable, Iterable, cast, Tuple
koder aka kdanilov652cd802015-04-13 12:21:07 +03006
koder aka kdanilova732a602017-02-01 20:29:56 +02007from concurrent.futures import ThreadPoolExecutor, wait
koder aka kdanilov4643fd62015-02-10 16:20:13 -08008
kdanylov aka koder026e5f22017-05-15 01:04:39 +03009from cephlib.node import IRPCNode
10
koder aka kdanilov108ac362017-01-19 20:17:16 +020011from ..utils import StopTestError, get_time_interval_printable_info
kdanylov aka koder026e5f22017-05-15 01:04:39 +030012from ..result_classes import SuiteConfig, JobConfig, TimeSeries, IWallyStorage
koder aka kdanilov70227062016-11-26 23:23:21 +020013
koder aka kdanilov4af1c1d2015-05-18 15:48:58 +030014
koder aka kdanilovbc2c8982015-06-13 02:50:43 +030015logger = logging.getLogger("wally")
koder aka kdanilov88407ff2015-05-26 15:35:57 +030016
17
koder aka kdanilov70227062016-11-26 23:23:21 +020018__doc__ = "Contains base classes for performance tests"
19
20
koder aka kdanilov7f59d562016-12-26 01:34:23 +020021class PerfTest(metaclass=abc.ABCMeta):
koder aka kdanilov70227062016-11-26 23:23:21 +020022 """Base class for all tests"""
23 name = None # type: str
24 max_retry = 3
25 retry_time = 30
koder aka kdanilovf2865172016-12-30 03:35:11 +020026 job_config_cls = None # type: type
koder aka kdanilov70227062016-11-26 23:23:21 +020027
kdanylov aka koder026e5f22017-05-15 01:04:39 +030028 def __init__(self, storage: IWallyStorage, suite: SuiteConfig, on_idle: Callable[[], None] = None) -> None:
koder aka kdanilov108ac362017-01-19 20:17:16 +020029 self.suite = suite
koder aka kdanilove2de58c2015-04-24 22:59:36 +030030 self.stop_requested = False
koder aka kdanilov108ac362017-01-19 20:17:16 +020031 self.sorted_nodes_ids = sorted(node.node_id for node in self.suite.nodes)
koder aka kdanilov7f59d562016-12-26 01:34:23 +020032 self.on_idle = on_idle
koder aka kdanilovf2865172016-12-30 03:35:11 +020033 self.storage = storage
koder aka kdanilove2de58c2015-04-24 22:59:36 +030034
koder aka kdanilov3b4da8b2016-10-17 00:17:53 +030035 def request_stop(self) -> None:
koder aka kdanilove2de58c2015-04-24 22:59:36 +030036 self.stop_requested = True
koder aka kdanilov2066daf2015-04-23 21:05:41 +030037
koder aka kdanilov3b4da8b2016-10-17 00:17:53 +030038 def join_remote(self, path: str) -> str:
koder aka kdanilov108ac362017-01-19 20:17:16 +020039 return os.path.join(self.suite.remote_dir, path)
koder aka kdanilov4500a5f2015-04-17 16:55:17 +030040
koder aka kdanilov4af1c1d2015-05-18 15:48:58 +030041 @abc.abstractmethod
koder aka kdanilovbbbe1dc2016-12-20 01:19:56 +020042 def run(self) -> None:
koder aka kdanilov4643fd62015-02-10 16:20:13 -080043 pass
44
koder aka kdanilovbc2c8982015-06-13 02:50:43 +030045 @abc.abstractmethod
koder aka kdanilov39e449e2016-12-17 15:15:26 +020046 def format_for_console(self, data: Any) -> str:
koder aka kdanilovec1b9732015-04-23 20:43:29 +030047 pass
48
koder aka kdanilov4643fd62015-02-10 16:20:13 -080049
koder aka kdanilov70227062016-11-26 23:23:21 +020050class ThreadedTest(PerfTest, metaclass=abc.ABCMeta):
51 """Base class for tests, which spawn separated thread for each node"""
52
53 # max allowed time difference between starts and stops of run of the same test on different test nodes
54 # used_max_diff = max((min_run_time * max_rel_time_diff), max_time_diff)
55 max_time_diff = 5
56 max_rel_time_diff = 0.05
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020057 load_profile_name = None # type: str
koder aka kdanilov70227062016-11-26 23:23:21 +020058
koder aka kdanilov7f59d562016-12-26 01:34:23 +020059 def __init__(self, *args, **kwargs) -> None:
60 PerfTest.__init__(self, *args, **kwargs)
koder aka kdanilova732a602017-02-01 20:29:56 +020061 self.job_configs = None # type: List[JobConfig]
koder aka kdanilovbc2c8982015-06-13 02:50:43 +030062
63 @abc.abstractmethod
koder aka kdanilova732a602017-02-01 20:29:56 +020064 def get_expected_runtime(self, iter_cfg: JobConfig) -> Optional[int]:
koder aka kdanilovbc2c8982015-06-13 02:50:43 +030065 pass
66
koder aka kdanilova732a602017-02-01 20:29:56 +020067 def get_not_done_jobs(self) -> Iterable[JobConfig]:
koder aka kdanilov108ac362017-01-19 20:17:16 +020068 jobs_map = {job.storage_id: job for job in self.job_configs}
69 already_in_storage = set()
koder aka kdanilova732a602017-02-01 20:29:56 +020070 for db_config in cast(List[JobConfig], self.storage.iter_job(self.suite)):
koder aka kdanilov108ac362017-01-19 20:17:16 +020071 if db_config.storage_id in jobs_map:
72 job = jobs_map[db_config.storage_id]
73 if job != db_config:
74 logger.error("Test info at '%s.%s' is not equal to expected config for iteration %s.%s." +
koder aka kdanilovf2865172016-12-30 03:35:11 +020075 " Maybe configuration was changed before test was restarted. " +
76 "DB cfg is:\n %s\nExpected cfg is:\n %s\nFix DB or rerun test from beginning",
koder aka kdanilov108ac362017-01-19 20:17:16 +020077 self.suite.storage_id, job.storage_id, self.name, job.summary,
koder aka kdanilovf2865172016-12-30 03:35:11 +020078 str(db_config).replace("\n", "\n "),
koder aka kdanilov108ac362017-01-19 20:17:16 +020079 str(job).replace("\n", "\n "))
koder aka kdanilovf2865172016-12-30 03:35:11 +020080 raise StopTestError()
koder aka kdanilovbbbe1dc2016-12-20 01:19:56 +020081
koder aka kdanilov108ac362017-01-19 20:17:16 +020082 logger.info("Test iteration %s.%s found in storage and will be skipped", self.name, job.summary)
83 already_in_storage.add(db_config.storage_id)
84
85 return [job for job in self.job_configs if job.storage_id not in already_in_storage]
koder aka kdanilov70227062016-11-26 23:23:21 +020086
koder aka kdanilovbbbe1dc2016-12-20 01:19:56 +020087 def run(self) -> None:
koder aka kdanilov108ac362017-01-19 20:17:16 +020088 self.storage.put_or_check_suite(self.suite)
koder aka kdanilovf2865172016-12-30 03:35:11 +020089
koder aka kdanilov108ac362017-01-19 20:17:16 +020090 not_in_storage = list(self.get_not_done_jobs())
koder aka kdanilov70227062016-11-26 23:23:21 +020091 if not not_in_storage:
92 logger.info("All test iteration in storage already. Skip test")
93 return
94
koder aka kdanilovf2865172016-12-30 03:35:11 +020095 logger.debug("Run test %s with profile %r on nodes %s.", self.name,
96 self.load_profile_name,
97 ",".join(self.sorted_nodes_ids))
koder aka kdanilov23e6bdf2016-12-24 02:18:54 +020098 logger.debug("Prepare nodes")
koder aka kdanilov70227062016-11-26 23:23:21 +020099
koder aka kdanilov70227062016-11-26 23:23:21 +0200100
koder aka kdanilov108ac362017-01-19 20:17:16 +0200101 with ThreadPoolExecutor(len(self.suite.nodes)) as pool:
koder aka kdanilovf2865172016-12-30 03:35:11 +0200102 # config nodes
koder aka kdanilov108ac362017-01-19 20:17:16 +0200103 list(pool.map(self.config_node, self.suite.nodes))
koder aka kdanilovf2865172016-12-30 03:35:11 +0200104
koder aka kdanilov108ac362017-01-19 20:17:16 +0200105 run_times = list(map(self.get_expected_runtime, not_in_storage))
koder aka kdanilovf2865172016-12-30 03:35:11 +0200106
koder aka kdanilov70227062016-11-26 23:23:21 +0200107 if None not in run_times:
kdanylov aka koder84de1e42017-05-22 14:00:07 +0300108 # +10s - is a rough estimation for additional operations per iteration
109 expected_run_time = int(sum(run_times) + 10 * len(not_in_storage))
koder aka kdanilov70227062016-11-26 23:23:21 +0200110
koder aka kdanilovf2865172016-12-30 03:35:11 +0200111 exec_time_s, end_dt_s = get_time_interval_printable_info(expected_run_time)
kdanylov aka koder736e5c12017-05-07 17:27:14 +0300112 logger.info("Entire test should takes around %s and finish at %s", exec_time_s, end_dt_s)
koder aka kdanilov70227062016-11-26 23:23:21 +0200113
koder aka kdanilov108ac362017-01-19 20:17:16 +0200114 for job in not_in_storage:
115 results = [] # type: List[TimeSeries]
116 for retry_idx in range(self.max_retry):
kdanylov aka koder3a9e5db2017-05-09 20:00:44 +0300117 logger.info("Preparing job %s", job.params.summary)
koder aka kdanilov70227062016-11-26 23:23:21 +0200118
koder aka kdanilov23e6bdf2016-12-24 02:18:54 +0200119 # prepare nodes for new iterations
koder aka kdanilov108ac362017-01-19 20:17:16 +0200120 wait([pool.submit(self.prepare_iteration, node, job) for node in self.suite.nodes])
koder aka kdanilov23e6bdf2016-12-24 02:18:54 +0200121
koder aka kdanilov108ac362017-01-19 20:17:16 +0200122 expected_job_time = self.get_expected_runtime(job)
koder aka kdanilovf2865172016-12-30 03:35:11 +0200123 exec_time_s, end_dt_s = get_time_interval_printable_info(expected_job_time)
kdanylov aka koder736e5c12017-05-07 17:27:14 +0300124 logger.info("Job should takes around %s and finish at %s", exec_time_s, end_dt_s)
koder aka kdanilovf2865172016-12-30 03:35:11 +0200125
koder aka kdanilov108ac362017-01-19 20:17:16 +0200126 jfutures = [pool.submit(self.run_iteration, node, job) for node in self.suite.nodes]
127 failed = False
128 for future in jfutures:
129 try:
130 results.extend(future.result())
131 except EnvironmentError:
132 failed = True
koder aka kdanilov70227062016-11-26 23:23:21 +0200133
koder aka kdanilov108ac362017-01-19 20:17:16 +0200134 if not failed:
135 break
136
137 if self.max_retry - 1 == retry_idx:
138 logger.exception("Fio failed")
139 raise StopTestError()
140
141 logger.exception("During fio run")
142 logger.info("Sleeping %ss and retrying job", self.retry_time)
143 time.sleep(self.retry_time)
144 results = []
145
146 # per node jobs start and stop times
koder aka kdanilov70227062016-11-26 23:23:21 +0200147 start_times = [] # type: List[int]
148 stop_times = [] # type: List[int]
149
koder aka kdanilov108ac362017-01-19 20:17:16 +0200150 for ts in results:
151 self.storage.put_ts(ts)
152 if len(ts.times) >= 2: # type: ignore
153 start_times.append(ts.times[0])
154 stop_times.append(ts.times[-1])
koder aka kdanilov70227062016-11-26 23:23:21 +0200155
koder aka kdanilovf2865172016-12-30 03:35:11 +0200156 if len(start_times) > 0:
157 min_start_time = min(start_times)
158 max_start_time = max(start_times)
159 min_stop_time = min(stop_times)
koder aka kdanilov70227062016-11-26 23:23:21 +0200160
koder aka kdanilovf2865172016-12-30 03:35:11 +0200161 max_allowed_time_diff = int((min_stop_time - max_start_time) * self.max_rel_time_diff)
162 max_allowed_time_diff = max(max_allowed_time_diff, self.max_time_diff)
koder aka kdanilov70227062016-11-26 23:23:21 +0200163
koder aka kdanilovf2865172016-12-30 03:35:11 +0200164 if min_start_time + self.max_time_diff < max_allowed_time_diff:
165 logger.warning("Too large difference in %s:%s start time - %s. " +
166 "Max recommended difference is %s",
koder aka kdanilov108ac362017-01-19 20:17:16 +0200167 self.name, job.summary,
koder aka kdanilovf2865172016-12-30 03:35:11 +0200168 max_start_time - min_start_time, self.max_time_diff)
koder aka kdanilov70227062016-11-26 23:23:21 +0200169
koder aka kdanilovf2865172016-12-30 03:35:11 +0200170 if min_stop_time + self.max_time_diff < max_allowed_time_diff:
171 logger.warning("Too large difference in %s:%s stop time - %s. " +
172 "Max recommended difference is %s",
koder aka kdanilov108ac362017-01-19 20:17:16 +0200173 self.name, job.summary,
koder aka kdanilovf2865172016-12-30 03:35:11 +0200174 max_start_time - min_start_time, self.max_time_diff)
koder aka kdanilov70227062016-11-26 23:23:21 +0200175
koder aka kdanilova732a602017-02-01 20:29:56 +0200176 job.reliable_info_range = (int(max_start_time), int(min_stop_time))
koder aka kdanilov108ac362017-01-19 20:17:16 +0200177
178 self.storage.put_job(self.suite, job)
koder aka kdanilovf2865172016-12-30 03:35:11 +0200179 self.storage.sync()
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200180
181 if self.on_idle is not None:
182 self.on_idle()
183
koder aka kdanilov70227062016-11-26 23:23:21 +0200184 @abc.abstractmethod
185 def config_node(self, node: IRPCNode) -> None:
186 pass
187
188 @abc.abstractmethod
koder aka kdanilova732a602017-02-01 20:29:56 +0200189 def prepare_iteration(self, node: IRPCNode, job: JobConfig) -> None:
koder aka kdanilov23e6bdf2016-12-24 02:18:54 +0200190 pass
191
192 @abc.abstractmethod
koder aka kdanilova732a602017-02-01 20:29:56 +0200193 def run_iteration(self, node: IRPCNode, job: JobConfig) -> List[TimeSeries]:
koder aka kdanilovbc2c8982015-06-13 02:50:43 +0300194 pass
195
196
koder aka kdanilov70227062016-11-26 23:23:21 +0200197class TwoScriptTest(ThreadedTest, metaclass=abc.ABCMeta):
198 def __init__(self, *dt, **mp) -> None:
koder aka kdanilovbc2c8982015-06-13 02:50:43 +0300199 ThreadedTest.__init__(self, *dt, **mp)
koder aka kdanilov108ac362017-01-19 20:17:16 +0200200 self.prerun_script = self.suite.params['prerun_script']
201 self.run_script = self.suite.params['run_script']
202 self.prerun_tout = self.suite.params.get('prerun_tout', 3600)
203 self.run_tout = self.suite.params.get('run_tout', 3600)
204 # TODO: fix job_configs field
205 raise NotImplementedError("Fix job configs")
Yulia Portnova7ddfa732015-02-24 17:32:58 +0200206
koder aka kdanilova732a602017-02-01 20:29:56 +0200207 def get_expected_runtime(self, job: JobConfig) -> Optional[int]:
koder aka kdanilov70227062016-11-26 23:23:21 +0200208 return None
Yulia Portnova7ddfa732015-02-24 17:32:58 +0200209
koder aka kdanilov70227062016-11-26 23:23:21 +0200210 def config_node(self, node: IRPCNode) -> None:
211 node.copy_file(self.run_script, self.join_remote(self.run_script))
212 node.copy_file(self.prerun_script, self.join_remote(self.prerun_script))
koder aka kdanilovbc2c8982015-06-13 02:50:43 +0300213
koder aka kdanilov70227062016-11-26 23:23:21 +0200214 cmd = self.join_remote(self.prerun_script)
koder aka kdanilov108ac362017-01-19 20:17:16 +0200215 cmd += ' ' + self.suite.params.get('prerun_opts', '')
koder aka kdanilov3b4da8b2016-10-17 00:17:53 +0300216 node.run(cmd, timeout=self.prerun_tout)
Yulia Portnova7ddfa732015-02-24 17:32:58 +0200217
koder aka kdanilova732a602017-02-01 20:29:56 +0200218 def prepare_iteration(self, node: IRPCNode, job: JobConfig) -> None:
koder aka kdanilov23e6bdf2016-12-24 02:18:54 +0200219 pass
220
koder aka kdanilova732a602017-02-01 20:29:56 +0200221 def run_iteration(self, node: IRPCNode, job: JobConfig) -> List[TimeSeries]:
koder aka kdanilov23e6bdf2016-12-24 02:18:54 +0200222 # TODO: have to store logs
koder aka kdanilov70227062016-11-26 23:23:21 +0200223 cmd = self.join_remote(self.run_script)
koder aka kdanilov108ac362017-01-19 20:17:16 +0200224 cmd += ' ' + self.suite.params.get('run_opts', '')
koder aka kdanilov23e6bdf2016-12-24 02:18:54 +0200225 return self.parse_results(node.run(cmd, timeout=self.run_tout))
koder aka kdanilov70227062016-11-26 23:23:21 +0200226
227 @abc.abstractmethod
koder aka kdanilov108ac362017-01-19 20:17:16 +0200228 def parse_results(self, data: str) -> List[TimeSeries]:
koder aka kdanilov70227062016-11-26 23:23:21 +0200229 pass
230