blob: 111e6bbcac39b266881f3b83086b64b240707501 [file] [log] [blame]
koder aka kdanilov4643fd62015-02-10 16:20:13 -08001import abc
koder aka kdanilovbc2c8982015-06-13 02:50:43 +03002import time
3import logging
koder aka kdanilov4643fd62015-02-10 16:20:13 -08004import os.path
kdanylov aka koder2e5fce12017-05-23 01:47:36 +03005from typing import Any, List, Optional, Callable, Iterable, cast
koder aka kdanilov652cd802015-04-13 12:21:07 +03006
koder aka kdanilova732a602017-02-01 20:29:56 +02007from concurrent.futures import ThreadPoolExecutor, wait
koder aka kdanilov4643fd62015-02-10 16:20:13 -08008
kdanylov aka koder026e5f22017-05-15 01:04:39 +03009from cephlib.node import IRPCNode
kdanylov aka koder2e5fce12017-05-23 01:47:36 +030010from cephlib.units import unit_conversion_coef_f
kdanylov aka koder026e5f22017-05-15 01:04:39 +030011
koder aka kdanilov108ac362017-01-19 20:17:16 +020012from ..utils import StopTestError, get_time_interval_printable_info
kdanylov aka koder026e5f22017-05-15 01:04:39 +030013from ..result_classes import SuiteConfig, JobConfig, TimeSeries, IWallyStorage
koder aka kdanilov70227062016-11-26 23:23:21 +020014
koder aka kdanilov4af1c1d2015-05-18 15:48:58 +030015
# Module-level logger; every message emitted here goes to the "wally" logger.
logger = logging.getLogger("wally")


# Module description, assigned explicitly instead of using a leading docstring.
__doc__ = "Contains base classes for performance tests"
20
21
class PerfTest(metaclass=abc.ABCMeta):
    """Base class for all tests.

    Keeps the suite configuration, the result storage and an optional
    callback, and provides small helpers shared by concrete tests.
    Subclasses must implement :meth:`run` and :meth:`format_for_console`.
    """

    name = None  # type: str
    # Number of attempts and the pause (argument of time.sleep) between them;
    # consumed by subclasses that retry failed iterations.
    max_retry = 3
    retry_time = 30
    job_config_cls = None  # type: type

    def __init__(self, storage: IWallyStorage, suite: SuiteConfig,
                 on_tests_boundry: Optional[Callable[[bool], None]] = None) -> None:
        """Remember the collaborators and precompute the sorted node ids.

        :param storage: storage object the test reads from / writes to.
        :param suite: suite configuration; ``suite.nodes`` and
            ``suite.remote_dir`` are read by this class.
        :param on_tests_boundry: optional callback taking a single bool;
            ``None`` (the default) disables it.
        """
        self.storage = storage
        self.suite = suite
        self.stop_requested = False
        self.on_tests_boundry = on_tests_boundry
        # Sorted once so that log output is stable between runs.
        self.sorted_nodes_ids = sorted(node.node_id for node in suite.nodes)

    def request_stop(self) -> None:
        """Set the ``stop_requested`` flag."""
        self.stop_requested = True

    def join_remote(self, path: str) -> str:
        """Return ``path`` joined onto the suite's ``remote_dir``."""
        return os.path.join(self.suite.remote_dir, path)

    @abc.abstractmethod
    def run(self) -> None:
        """Execute the test; must be provided by subclasses."""
        pass

    @abc.abstractmethod
    def format_for_console(self, data: Any) -> str:
        """Render ``data`` as text for console output; must be provided by subclasses."""
        pass
50
koder aka kdanilov4643fd62015-02-10 16:20:13 -080051
class ThreadedTest(PerfTest, metaclass=abc.ABCMeta):
    """Base class for tests, which spawn separated thread for each node.

    :meth:`run` configures every node, then executes each job that is not
    yet in storage on all nodes in parallel (one pool worker per node),
    retrying a job when an ``EnvironmentError`` is raised by any node.
    """

    # Max allowed time difference between starts and stops of run of the same
    # test on different test nodes:
    #   used_max_diff = max((min_run_time * max_rel_time_diff), max_time_diff)
    max_time_diff = 5
    max_rel_time_diff = 0.05
    load_profile_name: str = None  # type: ignore

    def __init__(self, *args, **kwargs) -> None:
        """Forward all arguments to :class:`PerfTest`; ``job_configs`` must be set by subclasses."""
        super().__init__(*args, **kwargs)
        self.job_configs: List[JobConfig] = None  # type: ignore

    @abc.abstractmethod
    def get_expected_runtime(self, iter_cfg: JobConfig) -> Optional[int]:
        """Return the expected run time of ``iter_cfg`` or ``None`` when unknown."""
        pass

    def get_not_done_jobs(self) -> Iterable[JobConfig]:
        """Return the jobs from ``self.job_configs`` which are not stored yet.

        Every job found in storage is compared with the expected
        configuration having the same ``storage_id``.

        :raises StopTestError: if a stored job differs from the expected one.
        """
        jobs_map = {job.storage_id: job for job in self.job_configs}
        already_in_storage = set()
        for db_config in cast(List[JobConfig], self.storage.iter_job(self.suite)):
            job = jobs_map.get(db_config.storage_id)
            if job is None:
                # Stored job does not belong to the current job list - ignore it.
                continue

            if job != db_config:
                logger.error("Test info at '%s.%s' is not equal to expected config for iteration %s.%s."
                             " Maybe configuration was changed before test was restarted. "
                             "DB cfg is:\n %s\nExpected cfg is:\n %s\nFix DB or rerun test from beginning",
                             self.suite.storage_id, job.storage_id, self.name, job.summary,
                             str(db_config).replace("\n", "\n "),
                             str(job).replace("\n", "\n "))
                raise StopTestError()

            logger.info("Test iteration %s.%s found in storage and will be skipped", self.name, job.summary)
            already_in_storage.add(db_config.storage_id)

        return [job for job in self.job_configs if job.storage_id not in already_in_storage]

    def run(self) -> None:
        """Run every job that is not in storage yet and store its results.

        :raises StopTestError: when a job keeps failing after ``max_retry``
            attempts (or when stored job configs do not match).
        """
        self.storage.put_or_check_suite(self.suite)

        not_in_storage = list(self.get_not_done_jobs())
        if not not_in_storage:
            logger.info("All test iteration in storage already. Skip test")
            return

        logger.debug("Run test %s with profile %r on nodes %s.", self.name,
                     self.load_profile_name,
                     ",".join(self.sorted_nodes_ids))
        logger.debug("Prepare nodes")

        with ThreadPoolExecutor(len(self.suite.nodes)) as pool:
            # config nodes; list() consumes the iterator so that an exception
            # raised by any config_node call is re-raised here
            list(pool.map(self.config_node, self.suite.nodes))

            run_times = list(map(self.get_expected_runtime, not_in_storage))

            if None not in run_times:
                # +10s - is a rough estimation for additional operations per iteration
                expected_run_time: int = int(sum(run_times) + 10 * len(not_in_storage))  # type: ignore
                exec_time_s, end_dt_s = get_time_interval_printable_info(expected_run_time)
                logger.info("Entire test should takes around %s and finish at %s", exec_time_s, end_dt_s)

            for job in not_in_storage:
                results = self._run_job_with_retries(pool, job)

                for ts in results:
                    self.storage.put_ts(ts)

                self._check_sync_and_set_reliable_range(job, results)

                self.storage.put_job(self.suite, job)
                self.storage.sync()

    def _run_job_with_retries(self, pool: ThreadPoolExecutor, job: JobConfig) -> List[TimeSeries]:
        """Prepare and run ``job`` on all nodes, retrying on ``EnvironmentError``.

        :returns: time series collected from all nodes by the first attempt
            that finished without ``EnvironmentError``; an empty list if
            ``max_retry`` is not positive.
        :raises StopTestError: when the last allowed attempt fails.
        """
        nodes = self.suite.nodes

        for retry_idx in range(self.max_retry):
            logger.info("Preparing job %s", job.params.summary)

            # prepare nodes for new iterations
            prepare_futures = [pool.submit(self.prepare_iteration, node, job) for node in nodes]
            wait(prepare_futures)
            for future in prepare_futures:
                # wait() alone never re-raises; result() surfaces a failed
                # preparation instead of measuring an unprepared node.
                future.result()

            expected_job_time = self.get_expected_runtime(job)
            if expected_job_time is None:
                logger.info("Job execution time is unknown")
            else:
                exec_time_s, end_dt_s = get_time_interval_printable_info(expected_job_time)
                logger.info("Job should takes around %s and finish at %s", exec_time_s, end_dt_s)

            results: List[TimeSeries] = []
            error: Optional[EnvironmentError] = None

            if self.on_tests_boundry is not None:
                self.on_tests_boundry(True)
            try:
                jfutures = [pool.submit(self.run_iteration, node, job) for node in nodes]
                for future in jfutures:
                    try:
                        results.extend(future.result())
                    except EnvironmentError as exc:
                        # keep collecting so that every node finishes before retry
                        error = exc
            finally:
                # the closing boundary must be reported even on unexpected errors
                if self.on_tests_boundry is not None:
                    self.on_tests_boundry(False)

            if error is None:
                return results

            # No exception is being handled at this point, so the traceback
            # has to be passed explicitly via exc_info.
            if retry_idx == self.max_retry - 1:
                logger.error("Fio failed", exc_info=error)
                raise StopTestError() from error

            logger.error("During fio run", exc_info=error)
            logger.info("Sleeping %ss and retrying job", self.retry_time)
            time.sleep(self.retry_time)

        return []

    def _check_sync_and_set_reliable_range(self, job: JobConfig, results: List[TimeSeries]) -> None:
        """Warn about badly synchronized nodes and set ``job.reliable_info_range``.

        Only series with at least two time points are taken into account;
        if there are none, ``job`` is left untouched.
        """
        # per node jobs start and stop times
        start_times: List[int] = []
        stop_times: List[int] = []

        for ts in results:
            if len(ts.times) >= 2:  # type: ignore
                start_times.append(ts.times[0])
                stop_times.append(ts.times[-1])

        if not start_times:
            return

        min_start_time = min(start_times)
        max_start_time = max(start_times)
        min_stop_time = min(stop_times)
        max_stop_time = max(stop_times)

        # NOTE(review): thresholds are compared with raw ts.times values, as
        # before; if time_units is not seconds max_time_diff may need
        # scaling - confirm.
        max_allowed_time_diff = int((min_stop_time - max_start_time) * self.max_rel_time_diff)
        max_allowed_time_diff = max(max_allowed_time_diff, self.max_time_diff)

        start_diff = max_start_time - min_start_time
        if start_diff > max_allowed_time_diff:
            logger.warning("Too large difference in %s:%s start time - %s. "
                           "Max recommended difference is %s",
                           self.name, job.summary,
                           start_diff, max_allowed_time_diff)

        stop_diff = max_stop_time - min_stop_time
        if stop_diff > max_allowed_time_diff:
            logger.warning("Too large difference in %s:%s stop time - %s. "
                           "Max recommended difference is %s",
                           self.name, job.summary,
                           stop_diff, max_allowed_time_diff)

        # shrink the interval common to all nodes by one second on both sides
        one_s = int(unit_conversion_coef_f('s', results[0].time_units))
        job.reliable_info_range = (int(max_start_time) + one_s, int(min_stop_time) - one_s)

    @abc.abstractmethod
    def config_node(self, node: IRPCNode) -> None:
        """Configure ``node`` once, before any job is started."""
        pass

    @abc.abstractmethod
    def prepare_iteration(self, node: IRPCNode, job: JobConfig) -> None:
        """Prepare ``node`` for a single attempt to run ``job``."""
        pass

    @abc.abstractmethod
    def run_iteration(self, node: IRPCNode, job: JobConfig) -> List[TimeSeries]:
        """Run ``job`` on ``node`` and return the collected time series."""
        pass
204
205
class TwoScriptTest(ThreadedTest, metaclass=abc.ABCMeta):
    """Threaded test driven by two scripts from the suite parameters.

    ``prerun_script`` is executed while a node is configured and
    ``run_script`` on every iteration; the output of the latter is passed to
    :meth:`parse_results`.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Read script names and timeouts from ``suite.params``.

        Always raises ``NotImplementedError`` after that, as ``job_configs``
        handling is not implemented for this class yet.
        """
        ThreadedTest.__init__(self, *args, **kwargs)
        params = self.suite.params
        self.prerun_script = params['prerun_script']
        self.run_script = params['run_script']
        self.prerun_tout = params.get('prerun_tout', 3600)
        self.run_tout = params.get('run_tout', 3600)
        # TODO: fix job_configs field
        raise NotImplementedError("Fix job configs")

    def _remote_cmd(self, script: str, opts_key: str) -> str:
        """Build the command line: remote path of ``script`` plus options stored under ``opts_key``."""
        return self.join_remote(script) + ' ' + self.suite.params.get(opts_key, '')

    def get_expected_runtime(self, job: JobConfig) -> Optional[int]:
        """Run time of an arbitrary script is unknown, so always ``None``."""
        return None

    def config_node(self, node: IRPCNode) -> None:
        """Copy both scripts to ``node`` and execute the prerun script there."""
        for script in (self.run_script, self.prerun_script):
            node.copy_file(script, self.join_remote(script))

        node.run(self._remote_cmd(self.prerun_script, 'prerun_opts'),
                 timeout=self.prerun_tout)

    def prepare_iteration(self, node: IRPCNode, job: JobConfig) -> None:
        """Nothing to prepare between iterations."""
        pass

    def run_iteration(self, node: IRPCNode, job: JobConfig) -> List[TimeSeries]:
        """Execute the run script on ``node`` and parse what it returned."""
        # TODO: have to store logs
        output = node.run(self._remote_cmd(self.run_script, 'run_opts'),
                          timeout=self.run_tout)
        return self.parse_results(output)

    @abc.abstractmethod
    def parse_results(self, data: str) -> List[TimeSeries]:
        """Convert the run script output ``data`` into time series."""
        pass
239