| Alex | 0989ecf | 2022-03-29 13:43:21 -0500 | [diff] [blame] | 1 | #    Author: Alex Savatieiev (osavatieiev@mirantis.com; a.savex@gmail.com) | 
 | 2 | #    Copyright 2019-2022 Mirantis, Inc. | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 3 | import csv | 
 | 4 | import os | 
 | 5 | import json | 
 | 6 |  | 
| Alex | bdc7274 | 2021-12-23 13:26:05 -0600 | [diff] [blame] | 7 | from copy import deepcopy | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 8 | from datetime import datetime, timedelta, timezone | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 9 |  | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 10 | from cfg_checker.common import logger_cli | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 11 | from cfg_checker.common.decorators import retry | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 12 | from cfg_checker.common.file_utils import write_str_to_file | 
| Alex | 0bcf31b | 2022-03-29 17:38:58 -0500 | [diff] [blame] | 13 | from cfg_checker.common.other import utils | 
| Alex | bfa947c | 2021-11-11 18:14:28 -0600 | [diff] [blame] | 14 | from cfg_checker.helpers.console_utils import Progress | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 15 | from cfg_checker.helpers.console_utils import cl_typewriter | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 16 | from cfg_checker.reports import reporter | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 17 | # from cfg_checker.common.exception import InvalidReturnException | 
 | 18 | # from cfg_checker.common.exception import ConfigException | 
 | 19 | # from cfg_checker.common.exception import KubeException | 
 | 20 |  | 
 | 21 | from cfg_checker.nodes import KubeNodes | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 22 | from cfg_checker.agent.fio_runner import _get_seconds, _datetime_fmt | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 23 |  | 
 | 24 |  | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 25 | _file_datetime_fmt = "%m%d%Y%H%M%S%z" | 
 | 26 |  | 
 | 27 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 28 | def _reformat_timestr(_str, _chars=["/", ",", " ", ":", "+"], _tchar=""): | 
 | 29 |     _new = "" | 
 | 30 |     for _c in _str: | 
 | 31 |         _new += _c if _c not in _chars else _tchar | 
 | 32 |     return _new | 
 | 33 |  | 
 | 34 |  | 
 | 35 | def _parse_json_output(buffer): | 
 | 36 |     try: | 
 | 37 |         return json.loads(buffer) | 
 | 38 |     except TypeError as e: | 
 | 39 |         logger_cli.error( | 
 | 40 |             "ERROR: Status not decoded: {}\n{}".format(e, buffer) | 
 | 41 |         ) | 
 | 42 |     except json.decoder.JSONDecodeError as e: | 
 | 43 |         logger_cli.error( | 
 | 44 |             "ERROR: Status not decoded: {}\n{}".format(e, buffer) | 
 | 45 |         ) | 
 | 46 |     return {} | 
 | 47 |  | 
 | 48 |  | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 49 | class CephBench(object): | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 50 |     _agent_template = "cfgagent-template.yaml" | 
 | 51 |  | 
 | 52 |     def __init__(self, config): | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 53 |         self.env_config = config | 
 | 54 |         return | 
 | 55 |  | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 56 |  | 
 | 57 | class SaltCephBench(CephBench): | 
 | 58 |     def __init__( | 
 | 59 |         self, | 
 | 60 |         config | 
 | 61 |     ): | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 62 |         logger_cli.error("ERROR: Not impelented for Salt environment!") | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 63 |  | 
 | 64 |         # self.master = SaltNodes(config) | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 65 |         super(SaltCephBench, self).__init__(config) | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 66 |         return | 
 | 67 |  | 
 | 68 |  | 
 | 69 | class KubeCephBench(CephBench): | 
 | 70 |     def __init__(self, config): | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 71 |         self.agent_count = config.bench_agent_count | 
| Alex | dcb792f | 2021-10-04 14:24:21 -0500 | [diff] [blame] | 72 |         self.master = KubeNodes(config) | 
 | 73 |         super(KubeCephBench, self).__init__(config) | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 74 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 75 |         self.mode = config.bench_mode | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 76 |         self.resource_prefix = config.resource_prefix | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 77 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 78 |         if config.bench_mode == "tasks": | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 79 |             self.taskfile = config.bench_task_file | 
 | 80 |             self.load_tasks(self.taskfile) | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 81 |  | 
 | 82 |         if config.bench_mode == "cleanup": | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 83 |             self.cleanup_list = [] | 
 | 84 |             return | 
 | 85 |  | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 86 |         self.bench_name = config.bench_name | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 87 |         self.results_dump_path = config.bench_results_dump_path | 
 | 88 |         self.results = {} | 
 | 89 |         self.agent_results = {} | 
 | 90 |         self.cleanup_list = [] | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 91 |         self.agent_pods = [] | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 92 |  | 
 | 93 |         if config.bench_mode == "report": | 
 | 94 |             self.results = {} | 
 | 95 |             return | 
 | 96 |  | 
 | 97 |         self.storage_class = config.bench_storage_class | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 98 |         self.services = [] | 
 | 99 |         # By default, | 
 | 100 |         # 30 seconds should be enough to send tasks to 3-5 agents | 
 | 101 |         self.scheduled_delay = 30 | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 102 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 103 |     def set_ceph_info_class(self, ceph_info): | 
 | 104 |         self.ceph_info = ceph_info | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 105 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 106 |     def load_tasks(self, taskfile): | 
 | 107 |         # Load csv file | 
 | 108 |         logger_cli.info("-> Loading taskfile '{}'".format(taskfile)) | 
 | 109 |         self.tasks = [] | 
 | 110 |         with open(taskfile) as f: | 
 | 111 |             _reader = csv.reader(f, delimiter=',') | 
 | 112 |             # load packages | 
 | 113 |             for row in _reader: | 
 | 114 |                 self.tasks.append({ | 
 | 115 |                     "readwrite": row[0], | 
 | 116 |                     "rwmixread": row[1], | 
 | 117 |                     "bs": row[2], | 
 | 118 |                     "iodepth": row[3], | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 119 |                     "size": row[4], | 
 | 120 |                     "ramp_time": row[5], | 
 | 121 |                     "runtime": row[6] | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 122 |                 }) | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 123 |         logger_cli.info("-> Loaded {} tasks".format(len(self.tasks))) | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 124 |  | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 125 |     def add_for_deletion(self, obj, typ): | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 126 |         self.cleanup_list.append( | 
 | 127 |             [ | 
 | 128 |                 typ, | 
 | 129 |                 obj.metadata.namespace, | 
 | 130 |                 obj.metadata.name | 
 | 131 |             ] | 
 | 132 |         ) | 
 | 133 |         return | 
 | 134 |  | 
 | 135 |     def prepare_cleanup(self): | 
 | 136 |         # Assume number of resources not given | 
 | 137 |         # list all svc, pod, pvc, pv and identify 'cfgagent-xx ones | 
 | 138 |         _types = ["pv", "pvc", "pod", "svc"] | 
 | 139 |         _prefix = self.resource_prefix | 
 | 140 |         for _typ in _types: | 
 | 141 |             _list = self.master.list_resource_names_by_type_and_ns(_typ) | 
 | 142 |             for ns, name in _list: | 
 | 143 |                 if name.startswith(_prefix): | 
 | 144 |                     if ns: | 
 | 145 |                         _msg = "{} {}/{}".format(_typ, ns, name) | 
 | 146 |                     else: | 
 | 147 |                         _msg = "{} {}".format(_typ, name) | 
 | 148 |                     logger_cli.info("-> Found {}".format(_msg)) | 
 | 149 |                     self.cleanup_list.append([_typ, ns, name]) | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 150 |         return | 
 | 151 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 152 |     def prepare_agents(self, options): | 
 | 153 |         logger_cli.info("# Preparing {} agents".format(self.agent_count)) | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 154 |         # Increase volume size a bit, so datafile fits | 
 | 155 |         _quanitizer = 1.3 | 
| Alex | 0bcf31b | 2022-03-29 17:38:58 -0500 | [diff] [blame] | 156 |         _v_size, _vol_size_units = utils.split_option_type(options['size']) | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 157 |         _v_size = round(_v_size * _quanitizer) | 
 | 158 |         _vol_size = str(_v_size) + _vol_size_units + "i" | 
 | 159 |         logger_cli.info( | 
 | 160 |             "-> Testfile size: {0}, Volume size: {1} ({0}*{2})".format( | 
 | 161 |                 options['size'], | 
 | 162 |                 _vol_size, | 
 | 163 |                 _quanitizer | 
 | 164 |             ) | 
 | 165 |         ) | 
 | 166 |         # Start preparing | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 167 |         for idx in range(self.agent_count): | 
 | 168 |             # create pvc/pv and pod | 
 | 169 |             logger_cli.info("-> creating agent '{:02}'".format(idx)) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 170 |             # _agent, _pv, _pvc = self.master.prepare_benchmark_agent( | 
 | 171 |             _agent, _pvc = self.master.prepare_benchmark_agent( | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 172 |                 idx, | 
 | 173 |                 os.path.split(options["filename"])[0], | 
 | 174 |                 self.storage_class, | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 175 |                 _vol_size, | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 176 |                 self._agent_template | 
 | 177 |             ) | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 178 |             # save it to lists | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 179 |             self.agent_pods.append(_agent) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 180 |             # self.add_for_deletion(_pv, "pv") | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 181 |             self.add_for_deletion(_pvc, "pvc") | 
 | 182 |             self.add_for_deletion(_agent, "pod") | 
 | 183 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 184 |             # expose it | 
 | 185 |             _svc = self.master.expose_benchmark_agent(_agent) | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 186 |             self.add_for_deletion(_svc, "svc") | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 187 |             # Save service | 
 | 188 |             self.services.append(_svc) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 189 |             # prepopulate results | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 190 |             self.agent_results[_agent.metadata.name] = {} | 
 | 191 |             self.agent_results[_agent.metadata.name]["url"] = \ | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 192 |                 "http://{}:{}/api/".format( | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 193 |                     _svc.spec.cluster_ip, | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 194 |                     8765 | 
 | 195 |                 ) | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 196 |             self.agent_results[_agent.metadata.name]["storage_class"] = \ | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 197 |                 self.storage_class | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 198 |             self.agent_results[_agent.metadata.name]["volume_size"] = \ | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 199 |                 options['size'] | 
 | 200 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 201 |         logger_cli.info("-> Done creating agents") | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 202 |         # TODO: Update after implementing pooled task sending | 
| Alex | defbfeb | 2022-11-08 12:17:54 -0600 | [diff] [blame] | 203 |         # idea is to have time to schedule task to each agent every 5 sec max | 
 | 204 |         self.scheduled_delay = self.agent_count * 5 | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 205 |         logger_cli.info( | 
 | 206 |             "-> Schedule delay set to {} sec".format(self.scheduled_delay) | 
 | 207 |         ) | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 208 |         return | 
 | 209 |  | 
 | 210 |     def _poke_agent(self, url, body, action="GET"): | 
 | 211 |         _datafile = "/tmp/data" | 
 | 212 |         _data = [ | 
| Alex | bfa947c | 2021-11-11 18:14:28 -0600 | [diff] [blame] | 213 |             "-d", | 
 | 214 |             "@" + _datafile | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 215 |         ] | 
 | 216 |         _cmd = [ | 
 | 217 |             "curl", | 
 | 218 |             "-s", | 
 | 219 |             "-H", | 
 | 220 |             "'Content-Type: application/json'", | 
 | 221 |             "-X", | 
 | 222 |             action, | 
 | 223 |             url | 
 | 224 |         ] | 
 | 225 |         if body: | 
 | 226 |             _cmd += _data | 
 | 227 |             _ret = self.master.prepare_json_in_pod( | 
 | 228 |                 self.agent_pods[0].metadata.name, | 
 | 229 |                 self.master._namespace, | 
 | 230 |                 body, | 
 | 231 |                 _datafile | 
 | 232 |             ) | 
 | 233 |         _ret = self.master.exec_cmd_on_target_pod( | 
 | 234 |             self.agent_pods[0].metadata.name, | 
 | 235 |             self.master._namespace, | 
 | 236 |             " ".join(_cmd) | 
 | 237 |         ) | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 238 |         return _parse_json_output(_ret) | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 239 |  | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 240 |     def _ensure_agents_ready(self): | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 241 |         # make sure agents idle | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 242 |         _status_set = [] | 
 | 243 |         _ready_set = [] | 
 | 244 |         for _agent, _d in self.get_agents_status().items(): | 
 | 245 |             # obviously, there should be some answer | 
 | 246 |             if _d is None: | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 247 |                 logger_cli.error("ERROR: Agent status not available") | 
 | 248 |                 return False | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 249 |             # status should be idle or finished | 
 | 250 |             if _d['status'] not in ["idle", "finished"]: | 
 | 251 |                 logger_cli.error( | 
 | 252 |                     "Agent status invalid {}:{}".format(_agent, _d['status']) | 
 | 253 |                 ) | 
 | 254 |                 _status_set += [False] | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 255 |             else: | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 256 |                 # Good agent | 
 | 257 |                 _status_set += [True] | 
 | 258 |             # agent's fio shell should be in 'ready' | 
 | 259 |             if not _d["healthcheck"]["ready"]: | 
 | 260 |                 logger_cli.error("Agent is not ready {}".format(_agent)) | 
 | 261 |                 _ready_set += [False] | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 262 |             else: | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 263 |                 # 'fio' shell for agent is ready | 
 | 264 |                 _ready_set += [True] | 
 | 265 |         # all agent's statuses should be True | 
 | 266 |         # and all 'fio' shell modules should be 'ready' | 
 | 267 |         if not any(_status_set) or not any(_ready_set): | 
 | 268 |             # At least one is not ready and it was already logged above | 
 | 269 |             return False | 
 | 270 |         else: | 
 | 271 |             # All is good | 
 | 272 |             return True | 
 | 273 |  | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 274 |     def get_agents_status(self, silent=True): | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 275 |         _status = {} | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 276 |         _results = self.master.exec_on_labeled_pods_and_ns( | 
 | 277 |             "app=cfgagent", | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 278 |             "curl -s http://localhost:8765/api/fio", | 
 | 279 |             silent=silent | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 280 |         ) | 
 | 281 |         for _agent, _result in _results.items(): | 
 | 282 |             _j = _parse_json_output(_result) | 
 | 283 |             _status[_agent] = _j | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 284 |         return _status | 
 | 285 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 286 |     @retry(Exception, initial_wait=5) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 287 |     def get_agents_resultlist(self): | 
 | 288 |         _t = {"module": "fio", "action": "get_resultlist"} | 
 | 289 |         _status = {} | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 290 |         for _agent, _d in self.agent_results.items(): | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 291 |             _status[_agent] = self._poke_agent(_d["url"], _t, action="POST") | 
 | 292 |         return _status | 
 | 293 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 294 |     @retry(Exception, initial_wait=5) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 295 |     def get_result_from_agent(self, agent, time): | 
 | 296 |         _t = { | 
 | 297 |             "module": "fio", | 
 | 298 |             "action": "get_result", | 
 | 299 |             "options": { | 
 | 300 |                 "time": time | 
 | 301 |             } | 
 | 302 |         } | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 303 |         return self._poke_agent( | 
 | 304 |             self.agent_results[agent]["url"], | 
 | 305 |             _t, | 
 | 306 |             action="POST" | 
 | 307 |         ) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 308 |  | 
 | 309 |     def _get_next_scheduled_time(self): | 
 | 310 |         _now = datetime.now(timezone.utc) | 
 | 311 |         logger_cli.info("-> time is '{}'".format(_now.strftime(_datetime_fmt))) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 312 |         self.next_scheduled_time = _now + timedelta( | 
 | 313 |             seconds=self.scheduled_delay | 
 | 314 |         ) | 
 | 315 |         _str_time = self.next_scheduled_time.strftime(_datetime_fmt) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 316 |         logger_cli.info( | 
 | 317 |             "-> next benchmark scheduled to '{}'".format(_str_time) | 
 | 318 |         ) | 
 | 319 |         return _str_time | 
 | 320 |  | 
 | 321 |     def _send_scheduled_task(self, options): | 
 | 322 |         _task = { | 
 | 323 |             "module": "fio", | 
 | 324 |             "action": "do_scheduledrun", | 
 | 325 |             "options": options | 
 | 326 |         } | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 327 |         for _agent, _d in self.agent_results.items(): | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 328 |             logger_cli.info( | 
 | 329 |                 "-> sending task to '{}:{}'".format(_agent, _d["url"]) | 
 | 330 |             ) | 
 | 331 |             _ret = self._poke_agent(_d["url"], _task, action="POST") | 
 | 332 |             if 'error' in _ret: | 
 | 333 |                 logger_cli.error( | 
 | 334 |                     "ERROR: Agent returned: '{}'".format(_ret['error']) | 
 | 335 |                 ) | 
 | 336 |                 return False | 
 | 337 |         # No errors detected | 
 | 338 |         return True | 
 | 339 |  | 
    def track_benchmark(self, options):
        """Poll agents until the scheduled run finishes or times out.

        Prints a single-line live status (countdown, progress range,
        per-status agent counts), periodically snapshots Ceph cluster
        status into the current result slot, and returns True when all
        agents report 'finished', False on timeout.

        :param options: fio options incl. 'runtime', 'ramp_time' and
                        'scheduled_to' (key into self.results)
        """
        _runtime = _get_seconds(options["runtime"])
        _ramptime = _get_seconds(options["ramp_time"])
        # Sum up all timings that we must wait and double it
        _timeout = (self.scheduled_delay + _runtime + _ramptime) * 2
        # We should have no more than 65 measurements
        # NOTE(review): for very short runs this rounds to 0, which makes
        # the Ceph snapshot below fire on every loop pass — confirm intended
        _stats_delay = round((_runtime + _ramptime) / 65)
        # scheduled start (datetime) set by _get_next_scheduled_time()
        _start = self.next_scheduled_time
        _end = datetime.now(timezone.utc) + timedelta(seconds=_timeout)
        logger_cli.info(" ")
        tw = cl_typewriter()
        while True:
            # Print status
            tw.cl_start(" ")
            _sts = self.get_agents_status(silent=True)
            # Use same line
            diff = (_end - datetime.now(timezone.utc)).total_seconds()
            _startin = (_start - datetime.now(timezone.utc)).total_seconds()
            if _startin > 0:
                # run not started yet: show countdown only
                tw.cl_inline("-> starting in {:.2f}s ".format(_startin))
            else:
                # run in progress: remaining time + min/max agent progress
                tw.cl_inline("-> {:.2f}s; ".format(diff))
                _progress = [_st["progress"] for _st in _sts.values()]
                tw.cl_inline(
                    "{}% <-> {}%; ".format(
                        min(_progress),
                        max(_progress)
                    )
                )

                # counts per distinct agent status, e.g. "2 running, 1 idle"
                _a_sts = [_t["status"] for _t in _sts.values()]
                tw.cl_inline(
                    ", ".join(
                        ["{} {}".format(_a_sts.count(_s), _s)
                         for _s in set(_a_sts)]
                    )
                )

            # Get Ceph status if _start time passed
            _elapsed = (datetime.now(timezone.utc) - _start).total_seconds()
            if _elapsed > _stats_delay:
                # Use same line output
                tw.cl_inline(" {:.2f}s elapsed".format(_elapsed))
                # keyed by elapsed seconds with one decimal
                _sec = "{:0.1f}".format(_elapsed)
                self.results[options["scheduled_to"]]["ceph"][_sec] = \
                    self.ceph_info.get_cluster_status()
            # Check if agents finished
            finished = [True for _s in _sts.values()
                        if _s["status"] == 'finished']
            _fcnt = len(finished)
            _tcnt = len(_sts)
            if _fcnt < _tcnt:
                # show finished/total and keep polling
                tw.cl_inline("; {}/{}".format(_fcnt, _tcnt))
            else:
                tw.cl_flush(newline=True)
                logger_cli.info("-> All agents finished run")
                return True
            # recalc how much is left
            diff = (_end - datetime.now(timezone.utc)).total_seconds()
            # In case end_datetime was in past to begin with
            if diff < 0:
                tw.cl_flush(newline=True)
                logger_cli.info("-> Timed out waiting for agents to finish")
                return False
            tw.cl_flush()
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 405 |  | 
 | 406 |     def _do_testrun(self, options): | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 407 |         self.results[options["scheduled_to"]]["osd_df_before"] = \ | 
 | 408 |             self.ceph_info.get_ceph_osd_df() | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 409 |         # send single to agent | 
 | 410 |         if not self._send_scheduled_task(options): | 
 | 411 |             return False | 
 | 412 |         # Track this benchmark progress | 
 | 413 |         if not self.track_benchmark(options): | 
 | 414 |             return False | 
 | 415 |         else: | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 416 |             logger_cli.info("-> Finished testrun. Collecting results...") | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 417 |             # get ceph osd stats | 
 | 418 |             self.results[options["scheduled_to"]]["osd_df_after"] = \ | 
 | 419 |                 self.ceph_info.get_ceph_osd_df() | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 420 |             # Get results for each agent | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 421 |             self.collect_results() | 
 | 422 |             logger_cli.info("-> Calculating totals and averages") | 
 | 423 |             self.calculate_totals() | 
 | 424 |             self.calculate_ceph_stats() | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 425 |             self.osd_df_compare(options["scheduled_to"]) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 426 |             logger_cli.info("-> Dumping results") | 
 | 427 |             for _time, _d in self.results.items(): | 
 | 428 |                 self.dump_result( | 
 | 429 |                     self._get_dump_filename(_time), | 
 | 430 |                     _d | 
 | 431 |                 ) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 432 |             return True | 
 | 433 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 434 |     def wait_ceph_cooldown(self): | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 435 |         # TODO: Query Ceph ince a 20 sec to make sure its load dropped | 
 | 436 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 437 |         # get ceph idle status | 
 | 438 |         self.ceph_idle_status = self.ceph_info.get_cluster_status() | 
 | 439 |         self.health_detail = self.ceph_info.get_health_detail() | 
 | 440 |         self.ceph_df = self.ceph_info.get_ceph_df() | 
 | 441 |         self.ceph_pg_dump = self.ceph_info.get_ceph_pg_dump() | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 442 |         return | 
 | 443 |  | 
 | 444 |     def run_benchmark(self, options): | 
 | 445 |         logger_cli.info("# Starting '{}' benchmark".format(self.mode)) | 
 | 446 |         # Check agent readyness | 
 | 447 |         logger_cli.info("# Checking agents") | 
 | 448 |         if not self._ensure_agents_ready(): | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 449 |             return False | 
 | 450 |  | 
 | 451 |         # Make sure that Ceph is at low load | 
 | 452 |         # TODO: Ceph status check | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 453 |         # self._wait_ceph_cooldown() | 
 | 454 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 455 |         # Do benchmark according to mode | 
 | 456 |         if self.mode == "tasks": | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 457 |             logger_cli.info( | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 458 |                 "# Running benchmark with tasks from '{}'".format( | 
 | 459 |                     self.taskfile | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 460 |                 ) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 461 |             ) | 
 | 462 |             # take next task | 
 | 463 |             _total_tasks = len(self.tasks) | 
 | 464 |             for idx in range(_total_tasks): | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 465 |                 # init time to schedule | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 466 |                 _task = self.tasks[idx] | 
| Alex | bdc7274 | 2021-12-23 13:26:05 -0600 | [diff] [blame] | 467 |                 _r = self.results | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 468 |                 logger_cli.info( | 
 | 469 |                     "-> Starting next task ({}/{})".format(idx+1, _total_tasks) | 
 | 470 |                 ) | 
 | 471 |                 logger_cli.info("-> Updating options with: {}".format( | 
 | 472 |                         ", ".join( | 
 | 473 |                             ["{} = {}".format(k, v) for k, v in _task.items()] | 
 | 474 |                         ) | 
 | 475 |                     ) | 
 | 476 |                 ) | 
 | 477 |                 # update options | 
 | 478 |                 options.update(_task) | 
| Alex | bdc7274 | 2021-12-23 13:26:05 -0600 | [diff] [blame] | 479 |                 # Check if such result already exists | 
 | 480 |                 o = "input_options" | 
 | 481 |                 _existing = filter( | 
 | 482 |                     lambda t: | 
 | 483 |                         _r[t]["id"] == idx and | 
 | 484 |                         _r[t]["mode"] == "tasks" and | 
 | 485 |                         _r[t][o]["readwrite"] == options["readwrite"] and | 
 | 486 |                         _r[t][o]["rwmixread"] == options["rwmixread"] and | 
 | 487 |                         _r[t][o]["bs"] == options["bs"] and | 
 | 488 |                         _r[t][o]["iodepth"] == options["iodepth"] and | 
 | 489 |                         _r[t][o]["size"] == options["size"], | 
 | 490 |                     _r | 
 | 491 |                 ) | 
 | 492 |                 if len(list(_existing)) > 0: | 
 | 493 |                     logger_cli.info( | 
 | 494 |                         "-> Skipped already performed task from {}: " | 
 | 495 |                         "line {}, {}({}), {}, {}, {}".format( | 
 | 496 |                             self.taskfile, | 
 | 497 |                             idx, | 
 | 498 |                             options["readwrite"], | 
 | 499 |                             options["rwmixread"], | 
 | 500 |                             options["bs"], | 
 | 501 |                             options["iodepth"], | 
 | 502 |                             options["size"] | 
 | 503 |                         ) | 
 | 504 |                     ) | 
 | 505 |                     continue | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 506 |                 _sch_time = self._get_next_scheduled_time() | 
 | 507 |                 options["scheduled_to"] = _sch_time | 
 | 508 |                 # init results table | 
| Alex | bdc7274 | 2021-12-23 13:26:05 -0600 | [diff] [blame] | 509 |                 _r[_sch_time] = { | 
 | 510 |                     "id": idx, | 
 | 511 |                     "mode": self.mode, | 
 | 512 |                     "input_options": deepcopy(options), | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 513 |                     "agents": {}, | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 514 |                     "ceph": {} | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 515 |                 } | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 516 |                 # exit on error | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 517 |                 if not self._do_testrun(options): | 
 | 518 |                     return False | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 519 |                 # Save ceph osd stats and wait cooldown | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 520 |                 self.wait_ceph_cooldown() | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 521 |         elif self.mode == "single": | 
 | 522 |             logger_cli.info("# Running single benchmark") | 
 | 523 |             # init time to schedule | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 524 |             _sch_time = self._get_next_scheduled_time() | 
 | 525 |             options["scheduled_to"] = _sch_time | 
 | 526 |             # init results table | 
 | 527 |             self.results[_sch_time] = { | 
| Alex | bdc7274 | 2021-12-23 13:26:05 -0600 | [diff] [blame] | 528 |                 "id": "{:2}".format(0), | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 529 |                 "input_options": options, | 
 | 530 |                 "agents": {}, | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 531 |                 "ceph": {} | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 532 |             } | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 533 |             if not self._do_testrun(options): | 
 | 534 |                 return False | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 535 |             # Save ceph osd stats | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 536 |         else: | 
 | 537 |             logger_cli.error("ERROR: Unknown mode '{}'".format(self.mode)) | 
 | 538 |             return False | 
 | 539 |  | 
 | 540 |         # Normal exit | 
 | 541 |         logger_cli.info("# All benchmark tasks done") | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 542 |         return True | 
 | 543 |  | 
 | 544 |     def cleanup(self): | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 545 |         logger_cli.info("# Cleaning up") | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 546 |         self.cleanup_list.reverse() | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 547 |  | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 548 |         for _res in self.cleanup_list: | 
 | 549 |             self.master.cleanup_resource_by_name(_res[0], _res[2], ns=_res[1]) | 
| Alex | bfa947c | 2021-11-11 18:14:28 -0600 | [diff] [blame] | 550 |  | 
 | 551 |         # Wait for resource to be cleaned | 
 | 552 |         _timeout = 120 | 
 | 553 |         _total = len(self.cleanup_list) | 
 | 554 |         logger_cli.info("-> Wait until {} resources cleaned".format(_total)) | 
 | 555 |         _p = Progress(_total) | 
 | 556 |         while True: | 
 | 557 |             _g = self.master.get_resource_phase_by_name | 
 | 558 |             _l = [_g(r[0], r[2], ns=r[1]) for r in self.cleanup_list] | 
 | 559 |             _l = [item for item in _l if item] | 
 | 560 |             _idx = _total - len(_l) | 
 | 561 |             if len(_l) > 0: | 
 | 562 |                 _p.write_progress(_idx) | 
 | 563 |             else: | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 564 |                 _p.write_progress(_idx) | 
| Alex | bfa947c | 2021-11-11 18:14:28 -0600 | [diff] [blame] | 565 |                 _p.end() | 
 | 566 |                 logger_cli.info("# Done cleaning up") | 
 | 567 |                 break | 
 | 568 |             if _timeout > 0: | 
 | 569 |                 _timeout -= 1 | 
 | 570 |             else: | 
 | 571 |                 _p.end() | 
 | 572 |                 logger_cli.info("# Timed out waiting after 120s.") | 
 | 573 |                 break | 
 | 574 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 575 |         return | 
 | 576 |  | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 577 |     def collect_results(self): | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 578 |         logger_cli.info("# Collecting results") | 
 | 579 |         # query agents for results | 
 | 580 |         _agents = self.get_agents_resultlist() | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 581 |         for _agent, _l in _agents.items(): | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 582 |             # Check if we already have this locally | 
 | 583 |             for _time in _l["resultlist"]: | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 584 |                 # There is a file already for this task/time | 
 | 585 |                 # Check if we need to load it | 
 | 586 |                 if _time not in self.results: | 
 | 587 |                     # Some older results found | 
 | 588 |                     # do not process them | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 589 |                     logger_cli.debug( | 
 | 590 |                         "...skipped old results for '{}'".format(_time) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 591 |                     ) | 
 | 592 |                     continue | 
 | 593 |                 elif _agent not in self.results[_time]["agents"]: | 
 | 594 |                     # Load result add it locally | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 595 |                     logger_cli.info( | 
 | 596 |                         "-> Getting results for '{}' from '{}'".format( | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 597 |                             _time, | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 598 |                             _agent | 
 | 599 |                         ) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 600 |                     ) | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 601 |                     _r = self.get_result_from_agent(_agent, _time) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 602 |                     self.results[_time]["agents"][_agent] = _r[_time] | 
 | 603 |                 else: | 
 | 604 |                     # Should never happen, actually | 
 | 605 |                     logger_cli.info( | 
 | 606 |                         "-> Skipped loaded result for '{}' from '{}'".format( | 
 | 607 |                             _time, | 
 | 608 |                             _agent | 
 | 609 |                         ) | 
 | 610 |                     ) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 611 |  | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 612 |     def _get_dump_filename(self, _time): | 
 | 613 |         _r = self.results[_time] | 
 | 614 |         _dirname = _r["input_options"]["name"] | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 615 |         _filename = "-".join([ | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 616 |             _reformat_timestr(_time), | 
 | 617 |             "{:02}".format(len(_r["agents"])), | 
 | 618 |             _r["input_options"]["readwrite"], | 
 | 619 |             _r["input_options"]["bs"], | 
 | 620 |             str(_r["input_options"]["iodepth"]), | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 621 |         ]) + ".json" | 
 | 622 |         return os.path.join( | 
 | 623 |             self.results_dump_path, | 
 | 624 |             _dirname, | 
 | 625 |             _filename | 
 | 626 |         ) | 
 | 627 |  | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 628 |     def preload_results(self): | 
 | 629 |         logger_cli.info( | 
 | 630 |             "# Preloading results for '{}'".format(self.bench_name) | 
 | 631 |         ) | 
 | 632 |         # get all dirs in folder | 
 | 633 |         _p = self.results_dump_path | 
 | 634 |         if not os.path.isdir(_p): | 
 | 635 |             logger_cli.warn( | 
 | 636 |                 "WARNING: Dump path is not a folder '{}'".format(_p) | 
 | 637 |             ) | 
 | 638 |             return | 
 | 639 |         for path, dirs, files in os.walk(_p): | 
 | 640 |             if path == os.path.join(_p, self.bench_name): | 
 | 641 |                 logger_cli.info("-> Folder found '{}'".format(path)) | 
 | 642 |                 for _fname in files: | 
 | 643 |                     logger_cli.debug("... processing '{}'".format(_fname)) | 
 | 644 |                     _ext = _fname.split('.')[-1] | 
 | 645 |                     if _ext != "json": | 
 | 646 |                         logger_cli.info( | 
 | 647 |                             "-> Extension invalid '{}', " | 
 | 648 |                             "'json' is expected".format(_ext) | 
 | 649 |                         ) | 
 | 650 |                         continue | 
 | 651 |                     # get time from filename | 
 | 652 |                     # Ugly, but works | 
 | 653 |                     _t = _fname.split('-')[0] | 
 | 654 |                     _str_time = _t[:14] + "+" + _t[14:] | 
 | 655 |                     _t = datetime.strptime(_str_time, _file_datetime_fmt) | 
 | 656 |                     _time = _t.strftime(_datetime_fmt) | 
 | 657 |                     self.results[_time] = self.load_dumped_result( | 
 | 658 |                         os.path.join(path, _fname) | 
 | 659 |                     ) | 
 | 660 |                     logger_cli.info( | 
 | 661 |                         "-> Loaded '{}' as '{}'".format( | 
 | 662 |                             _fname, | 
 | 663 |                             _time | 
 | 664 |                         ) | 
 | 665 |                     ) | 
 | 666 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 667 |     def dump_result(self, filename, data): | 
 | 668 |         # Function dumps all available results as jsons to the given path | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 669 |         # overwriting if needed | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 670 |         _folder, _file = os.path.split(filename) | 
 | 671 |         # Do dump | 
 | 672 |         if not os.path.exists(_folder): | 
 | 673 |             os.mkdir(_folder) | 
 | 674 |             logger_cli.info("-> Created folder '{}'".format(_folder)) | 
 | 675 |         # Dump agent data for this test run | 
 | 676 |         write_str_to_file(filename, json.dumps(data, indent=2)) | 
 | 677 |         logger_cli.info("-> Dumped '{}'".format(filename)) | 
| Alex | 3034ba5 | 2021-11-13 17:06:45 -0600 | [diff] [blame] | 678 |         return | 
 | 679 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 680 |     def load_dumped_result(self, filename): | 
 | 681 |         try: | 
 | 682 |             with open(filename, "rt+") as f: | 
 | 683 |                 return json.loads(f.read()) | 
 | 684 |         except FileNotFoundError as e: | 
 | 685 |             logger_cli.error( | 
 | 686 |                 "ERROR: {}".format(e) | 
 | 687 |             ) | 
 | 688 |         except TypeError as e: | 
 | 689 |             logger_cli.error( | 
 | 690 |                 "ERROR: Invalid file ({}): {}".format(filename, e) | 
 | 691 |             ) | 
 | 692 |         except json.decoder.JSONDecodeError as e: | 
 | 693 |             logger_cli.error( | 
 | 694 |                 "ERROR: Failed to decode json ({}): {}".format(filename, e) | 
 | 695 |             ) | 
 | 696 |         return None | 
 | 697 |  | 
 | 698 |     def _lookup_storage_class_id_by_name(self, storage_class_name): | 
 | 699 |         # Assume that self had proper data | 
 | 700 |         for _pool in self.ceph_df["pools"]: | 
 | 701 |             if storage_class_name == _pool["name"]: | 
 | 702 |                 return _pool["id"] | 
 | 703 |         return None | 
 | 704 |  | 
 | 705 |     def calculate_totals(self): | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 706 |         def _savg(vlist): | 
 | 707 |             if len(vlist) > 0: | 
 | 708 |                 return (sum(vlist) / len(vlist)) / 1000 | 
 | 709 |             else: | 
 | 710 |                 return 0 | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 711 |         # Calculate totals for Read and Write | 
 | 712 |         for _time, data in self.results.items(): | 
 | 713 |             if "totals" not in data: | 
 | 714 |                 data["totals"] = {} | 
 | 715 |             else: | 
 | 716 |                 continue | 
 | 717 |             _totals = data["totals"] | 
 | 718 |             _r_bw = 0 | 
 | 719 |             _r_avglat = [] | 
| Alex | 0989ecf | 2022-03-29 13:43:21 -0500 | [diff] [blame] | 720 |             _r_95clat = [] | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 721 |             _r_iops = 0 | 
 | 722 |             _w_bw = 0 | 
 | 723 |             _w_avglat = [] | 
| Alex | 0989ecf | 2022-03-29 13:43:21 -0500 | [diff] [blame] | 724 |             _w_95clat = [] | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 725 |             _w_iops = 0 | 
 | 726 |             for _a, _d in data["agents"].items(): | 
 | 727 |                 # Hardcoded number of jobs param :( | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 728 |                 _j = _d["jobs"][0] | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 729 |                 _r_bw += _j["read"]["bw_bytes"] | 
 | 730 |                 _r_avglat += [_j["read"]["lat_ns"]["mean"]] | 
 | 731 |                 _r_iops += _j["read"]["iops"] | 
 | 732 |                 _w_bw += _j["write"]["bw_bytes"] | 
 | 733 |                 _w_avglat += [_j["write"]["lat_ns"]["mean"]] | 
 | 734 |                 _w_iops += _j["write"]["iops"] | 
| Alex | defbfeb | 2022-11-08 12:17:54 -0600 | [diff] [blame] | 735 |                 # check for percentiles | 
 | 736 |                 if "percentile" in _j["read"]["clat_ns"]: | 
 | 737 |                     _r_95clat += \ | 
 | 738 |                         [_j["read"]["clat_ns"]["percentile"]["95.000000"]] | 
 | 739 |                 else: | 
 | 740 |                     _r_95clat += [] | 
 | 741 |                 if "percentile" in _j["write"]["clat_ns"]: | 
 | 742 |                     _w_95clat += \ | 
 | 743 |                         [_j["write"]["clat_ns"]["percentile"]["95.000000"]] | 
 | 744 |                 else: | 
 | 745 |                     _w_95clat += [] | 
 | 746 |  | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 747 |                 # Save storage class name | 
 | 748 |                 if "storage_class" not in _totals: | 
 | 749 |                     _totals["storage_class"] = \ | 
 | 750 |                         self.agent_results[_a]["storage_class"] | 
 | 751 |                     # Lookup storage class id and num_pg | 
 | 752 |                     _totals["storage_class_stats"] = \ | 
 | 753 |                         reporter.get_pool_stats_by_id( | 
 | 754 |                             self._lookup_storage_class_id_by_name( | 
 | 755 |                                 self.agent_results[_a]["storage_class"] | 
 | 756 |                             ), | 
 | 757 |                             self.ceph_pg_dump | 
 | 758 |                         ) | 
 | 759 |  | 
 | 760 |             _totals["read_bw_bytes"] = _r_bw | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 761 |             _totals["read_avg_lat_us"] = _savg(_r_avglat) | 
 | 762 |             _totals["read_95p_clat_us"] = _savg(_r_95clat) | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 763 |             _totals["read_iops"] = _r_iops | 
 | 764 |             _totals["write_bw_bytes"] = _w_bw | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 765 |             _totals["write_avg_lat_us"] = _savg(_w_avglat) | 
 | 766 |             _totals["write_95p_clat_us"] = _savg(_w_95clat) | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 767 |             _totals["write_iops"] = _w_iops | 
 | 768 |  | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 769 |     def calculate_ceph_stats(self): | 
 | 770 |         # func to get values as lists | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 771 |         def _get_max_value(key, stats): | 
 | 772 |             _max_time = 0 | 
 | 773 |             _value = 0 | 
 | 774 |             for _k, _v in stats.items(): | 
 | 775 |                 if key in _v and _value < _v[key]: | 
 | 776 |                     _max_time = _k | 
 | 777 |                     _value = _v[key] | 
 | 778 |             return _max_time, _value | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 779 |  | 
 | 780 |         def _perc(n, m): | 
 | 781 |             if not n: | 
 | 782 |                 return 0 | 
 | 783 |             elif not m: | 
 | 784 |                 return 0 | 
 | 785 |             else: | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 786 |                 return "{:.0f}%".format((n / m) * 100) | 
 | 787 |  | 
 | 788 |         def _axis_vals(val): | 
 | 789 |             return [ | 
 | 790 |                 val, int(val*1.1), int(val*0.75), int(val*0.50), int(val*0.15) | 
 | 791 |             ] | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 792 |  | 
 | 793 |         _stats = {} | 
 | 794 |         for _time, data in self.results.items(): | 
 | 795 |             if "ceph" not in data: | 
 | 796 |                 logger_cli.warning( | 
 | 797 |                     "WARNING: Ceph stats raw data not found in results" | 
 | 798 |                 ) | 
 | 799 |                 continue | 
 | 800 |             if "ceph_stats" not in data: | 
 | 801 |                 data["ceph_stats"] = {} | 
 | 802 |             else: | 
 | 803 |                 continue | 
 | 804 |             # Copy pool stats data | 
 | 805 |             for _e, _d in data["ceph"].items(): | 
 | 806 |                 _stats[_e] = _d["pgmap"] | 
 | 807 |             # Maximums | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 808 |             mrb_t, mrb = _get_max_value("read_bytes_sec", _stats) | 
 | 809 |             mwb_t, mwb = _get_max_value("write_bytes_sec", _stats) | 
 | 810 |             mri_t, mri = _get_max_value("read_op_per_sec", _stats) | 
 | 811 |             mwi_t, mwi = _get_max_value("write_op_per_sec", _stats) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 812 |             # Replace ceph with shorter data | 
 | 813 |             data["ceph"] = { | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 814 |                 "max_rbl": _axis_vals(mrb), | 
 | 815 |                 "max_rbl_time": mrb_t, | 
 | 816 |                 "max_wbl": _axis_vals(mwb), | 
 | 817 |                 "max_wbl_time": mwb_t, | 
 | 818 |                 "max_ril": _axis_vals(mri), | 
 | 819 |                 "max_ril_time": mri_t, | 
 | 820 |                 "max_wil": _axis_vals(mwi), | 
 | 821 |                 "max_wil_time": mwi_t, | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 822 |                 "stats": _stats | 
 | 823 |             } | 
 | 824 |             # Calculate %% values for barchart | 
 | 825 |             for _e, _d in data["ceph"]["stats"].items(): | 
 | 826 |                 _d["read_bytes_sec_perc"] = \ | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 827 |                     _perc(_d.get("read_bytes_sec", 0), mrb) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 828 |                 _d["write_bytes_sec_perc"] = \ | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 829 |                     _perc(_d.get("write_bytes_sec", 0), mwb) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 830 |                 _d["read_op_per_sec_perc"] = \ | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 831 |                     _perc(_d.get("read_op_per_sec", 0), mri) | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 832 |                 _d["write_op_per_sec_perc"] = \ | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 833 |                     _perc(_d.get("write_op_per_sec", 0), mwi) | 
 | 834 |         return | 
 | 835 |  | 
 | 836 |     def osd_df_compare(self, _time): | 
 | 837 |         def _get_osd(osd_id, nodes): | 
 | 838 |             for osd in nodes: | 
 | 839 |                 if osd["id"] == osd_id: | 
 | 840 |                     return osd | 
 | 841 |             return None | 
 | 842 |  | 
 | 843 |         logger_cli.info("# Comparing OSD stats") | 
 | 844 |         _osd = {} | 
 | 845 |         if _time not in self.results: | 
 | 846 |             logger_cli.warning("WARNING: {} not found in results. Check data") | 
 | 847 |             return | 
 | 848 |         data = self.results[_time] | 
 | 849 |         # Save summary | 
 | 850 |         data["osd_summary"] = {} | 
 | 851 |         data["osd_summary"]["before"] = data["osd_df_before"]["summary"] | 
 | 852 |         data["osd_summary"]["after"] = data["osd_df_after"]["summary"] | 
 | 853 |         data["osd_summary"]["active"] = { | 
 | 854 |             "status": "", | 
 | 855 |             "device_class": "", | 
| Alex | e4de114 | 2022-11-04 19:26:03 -0500 | [diff] [blame] | 856 |             "pgs": 0, | 
| Alex | 30380a4 | 2021-12-20 16:11:20 -0600 | [diff] [blame] | 857 |             "kb_used": 0, | 
 | 858 |             "kb_used_data": 0, | 
 | 859 |             "kb_used_omap": 0, | 
 | 860 |             "kb_used_meta": 0, | 
 | 861 |             "utilization": 0, | 
 | 862 |             "var_down": 0, | 
 | 863 |             "var_up": 0 | 
 | 864 |         } | 
 | 865 |         # Compare OSD counts | 
 | 866 |         osds_before = len(data["osd_df_before"]["nodes"]) | 
 | 867 |         osds_after = len(data["osd_df_after"]["nodes"]) | 
 | 868 |         if osds_before != osds_after: | 
 | 869 |             logger_cli.warning( | 
 | 870 |                 "WARNING: Before/After bench OSD " | 
 | 871 |                 "count mismatch for '{}'".format(_time) | 
 | 872 |             ) | 
 | 873 |         # iterate osds from before | 
 | 874 |         _pgs = 0 | 
 | 875 |         _classes = set() | 
 | 876 |         _nodes_up = 0 | 
 | 877 |         for idx in range(osds_before): | 
 | 878 |             _osd_b = data["osd_df_before"]["nodes"][idx] | 
 | 879 |             # search for the same osd in after | 
 | 880 |             _osd_a = _get_osd(_osd_b["id"], data["osd_df_after"]["nodes"]) | 
 | 881 |             # Save data to the new place | 
 | 882 |             _osd[_osd_b["name"]] = {} | 
 | 883 |             _osd[_osd_b["name"]]["before"] = _osd_b | 
 | 884 |             if not _osd_a: | 
 | 885 |                 # If this happen, Ceph cluster is actually broken | 
 | 886 |                 logger_cli.warning( | 
 | 887 |                     "WARNING: Wow! {} dissapered".format(_osd_b["name"]) | 
 | 888 |                 ) | 
 | 889 |                 _osd[_osd_b["name"]]["after"] = {} | 
 | 890 |             else: | 
 | 891 |                 _osd[_osd_b["name"]]["after"] = _osd_a | 
 | 892 |             _osd[_osd_b["name"]]["percent"] = {} | 
 | 893 |             # Calculate summary using "after" data | 
 | 894 |             _pgs += _osd_a["pgs"] | 
 | 895 |             _classes.update([_osd_a["device_class"]]) | 
 | 896 |             if _osd_a["status"] == "up": | 
 | 897 |                 _nodes_up += 1 | 
 | 898 |             # compare | 
 | 899 |             _keys_b = list(_osd_b.keys()) | 
 | 900 |             _keys_a = list(_osd_a.keys()) | 
 | 901 |             _nodes_up | 
 | 902 |             # To be safe, detect if some keys are different | 
 | 903 |             # ...and log it. | 
 | 904 |             _diff = set(_keys_b).symmetric_difference(_keys_a) | 
 | 905 |             if len(_diff) > 0: | 
 | 906 |                 # This should never happen, actually | 
 | 907 |                 logger_cli.warning( | 
 | 908 |                     "WARNING: Before/after keys mismatch " | 
 | 909 |                     "for OSD node {}: {}".format(idx, ", ".join(_diff)) | 
 | 910 |                 ) | 
 | 911 |                 continue | 
 | 912 |             # Compare each key and calculate how it changed | 
 | 913 |             for k in _keys_b: | 
 | 914 |                 if _osd_b[k] != _osd_a[k]: | 
 | 915 |                     # Announce change | 
 | 916 |                     logger_cli.debug( | 
 | 917 |                         "-> {:4}: {}, {} -> {}".format( | 
 | 918 |                             idx, | 
 | 919 |                             k, | 
 | 920 |                             _osd_b[k], | 
 | 921 |                             _osd_a[k] | 
 | 922 |                         ) | 
 | 923 |                     ) | 
 | 924 |                     # calculate percent | 
 | 925 |                     _change_perc = (_osd_a[k] / _osd_b[k]) * 100 - 100 | 
 | 926 |                     _osd[_osd_b["name"]]["percent"][k] = _change_perc | 
 | 927 |  | 
 | 928 |                     # Increase counters | 
 | 929 |                     _p = data["osd_summary"]["active"] | 
 | 930 |  | 
 | 931 |                     if k not in _p: | 
 | 932 |                         _p[k] = 1 | 
 | 933 |                     else: | 
 | 934 |                         _p[k] += 1 | 
 | 935 |                     if k == "var": | 
 | 936 |                         if _change_perc > 0: | 
 | 937 |                             _p["var_up"] += 1 | 
 | 938 |                         elif _change_perc < 0: | 
 | 939 |                             _p["var_down"] += 1 | 
 | 940 |         # Save sorted data | 
 | 941 |         data["osds"] = _osd | 
 | 942 |         logger_cli.info("-> Removing redundand osd before/after data") | 
 | 943 |         data.pop("osd_df_before") | 
 | 944 |         data.pop("osd_df_after") | 
 | 945 |         # Save summary | 
 | 946 |         data["osd_summary"]["active"]["status"] = "{}".format(_nodes_up) | 
 | 947 |         data["osd_summary"]["active"]["device_class"] = \ | 
 | 948 |             "{}".format(len(list(_classes))) | 
 | 949 |         data["osd_summary"]["active"]["pgs"] = _pgs | 
| Alex | 90ac153 | 2021-12-09 11:13:14 -0600 | [diff] [blame] | 950 |         return | 
 | 951 |  | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 952 |     # Create report | 
| Alex | 2a7657c | 2021-11-10 20:51:34 -0600 | [diff] [blame] | 953 |     def create_report(self, filename): | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 954 |         """ | 
 | 955 |         Create static html showing ceph info report | 
 | 956 |  | 
 | 957 |         :return: none | 
 | 958 |         """ | 
 | 959 |         logger_cli.info("### Generating report to '{}'".format(filename)) | 
 | 960 |         _report = reporter.ReportToFile( | 
 | 961 |             reporter.HTMLCephBench(self), | 
 | 962 |             filename | 
 | 963 |         ) | 
| Alex | b212954 | 2021-11-23 15:49:42 -0600 | [diff] [blame] | 964 |         _report( | 
 | 965 |             { | 
 | 966 |                 "results": self.results, | 
 | 967 |                 "idle_status": self.ceph_idle_status, | 
 | 968 |                 "health_detail": self.health_detail, | 
 | 969 |                 "ceph_df": self.ceph_df, | 
 | 970 |                 "ceph_pg_dump": self.ceph_pg_dump, | 
 | 971 |                 "info": self.ceph_info.ceph_info, | 
 | 972 |                 "cluster": self.ceph_info.cluster_info, | 
 | 973 |                 "ceph_version": self.ceph_info.ceph_version, | 
 | 974 |                 "nodes": self.agent_pods | 
 | 975 |             } | 
 | 976 |         ) | 
 | 977 |         logger_cli.info("-> Done") | 
| Alex | 5cace3b | 2021-11-10 16:40:37 -0600 | [diff] [blame] | 978 |  | 
 | 979 |         return |