Ceph benchmark report polishing and hotfixes
- Ordered taskfile report
- Skipping of already performed tasks
- Visual aids on viewing details and column sizes
- Column descriptions and notes
Fixes:
- Fixed option overwriting in taskfile mode
- Updated retry operation logging and timing
Related-PROD: PROD-36669
Change-Id: I92c049f0043e45bf032ca15e4fa3260355ee0eed
diff --git a/cfg_checker/common/decorators.py b/cfg_checker/common/decorators.py
index d83e469..a8b6da0 100644
--- a/cfg_checker/common/decorators.py
+++ b/cfg_checker/common/decorators.py
@@ -6,7 +6,7 @@
from cfg_checker.common import logger, logger_cli
-def retry(exceptions, total_tries=5, initial_wait=0.5, backoff_factor=2):
+def retry(exceptions, total_tries=5, initial_wait=1, backoff_factor=2):
"""
calling the decorated function applying an exponential backoff.
Args:
@@ -32,7 +32,7 @@
f.__name___,
total_tries
)
- logger_cli.debug(msg)
+ logger_cli.info(msg)
logger.debug(
msg + "args: {}, kwargs: {}".format(
print_args,
@@ -46,7 +46,7 @@
e,
_delay
)
- logger_cli.debug(msg)
+ logger_cli.info(msg)
logger.debug(
msg + "args: {}, kwargs: {}\n".format(
print_args,
diff --git a/cfg_checker/common/kube_utils.py b/cfg_checker/common/kube_utils.py
index f8c3469..6f303ff 100644
--- a/cfg_checker/common/kube_utils.py
+++ b/cfg_checker/common/kube_utils.py
@@ -609,7 +609,7 @@
)
return _pods
- @retry(ApiException, initial_wait=5)
+ @retry(ApiException, initial_wait=10)
def put_string_buffer_to_pod_as_textfile(
self,
pod_name,
diff --git a/cfg_checker/modules/ceph/bench.py b/cfg_checker/modules/ceph/bench.py
index 0780596..c0877db 100644
--- a/cfg_checker/modules/ceph/bench.py
+++ b/cfg_checker/modules/ceph/bench.py
@@ -2,6 +2,7 @@
import os
import json
+from copy import deepcopy
from datetime import datetime, timedelta, timezone
from cfg_checker.common import logger_cli
@@ -448,6 +449,7 @@
for idx in range(_total_tasks):
# init time to schedule
_task = self.tasks[idx]
+ _r = self.results
logger_cli.info(
"-> Starting next task ({}/{})".format(idx+1, _total_tasks)
)
@@ -459,11 +461,40 @@
)
# update options
options.update(_task)
+ # Check if such result already exists
+ o = "input_options"
+ _existing = filter(
+ lambda t:
+ _r[t]["id"] == idx and
+ _r[t]["mode"] == "tasks" and
+ _r[t][o]["readwrite"] == options["readwrite"] and
+ _r[t][o]["rwmixread"] == options["rwmixread"] and
+ _r[t][o]["bs"] == options["bs"] and
+ _r[t][o]["iodepth"] == options["iodepth"] and
+ _r[t][o]["size"] == options["size"],
+ _r
+ )
+ if len(list(_existing)) > 0:
+ logger_cli.info(
+ "-> Skipped already performed task from {}: "
+ "line {}, {}({}), {}, {}, {}".format(
+ self.taskfile,
+ idx,
+ options["readwrite"],
+ options["rwmixread"],
+ options["bs"],
+ options["iodepth"],
+ options["size"]
+ )
+ )
+ continue
_sch_time = self._get_next_scheduled_time()
options["scheduled_to"] = _sch_time
# init results table
- self.results[_sch_time] = {
- "input_options": options,
+ _r[_sch_time] = {
+ "id": idx,
+ "mode": self.mode,
+ "input_options": deepcopy(options),
"agents": {},
"ceph": {}
}
@@ -479,6 +510,7 @@
options["scheduled_to"] = _sch_time
# init results table
self.results[_sch_time] = {
+ "id": "{:2}".format(0),
"input_options": options,
"agents": {},
"ceph": {}
diff --git a/templates/bar_chart.j2 b/templates/bar_chart.j2
index 8ee1cbf..feb90ad 100644
--- a/templates/bar_chart.j2
+++ b/templates/bar_chart.j2
@@ -9,7 +9,7 @@
display: table-cell;
width: 100%;
height: 100%;
- padding-left: 15px;
+ padding-left: 30px;
}
.bc {
display: table;
diff --git a/templates/ceph_bench_html.j2 b/templates/ceph_bench_html.j2
index 229a553..7bdcfcc 100644
--- a/templates/ceph_bench_html.j2
+++ b/templates/ceph_bench_html.j2
@@ -43,6 +43,9 @@
display: none;
background-color: white;
visibility: hidden;
+ width: 100%;
+ border-style: dashed;
+ border-width: 1px;
}
.collapsable.in {
visibility: visible;
@@ -176,14 +179,16 @@
margin: 1px;
}
.item {
- display: inline-grid;
border-width: 1px;
border-style: solid;
margin: 1px 1px 1px 1px;
padding: 0px 1px 0px 1px;
}
+ .details-wrap { margin-left: 20px;}
.spacer { border-radius: 2px; width: 20px;}
+ .bench_id { border-radius: 10px; width: 50px; text-align: center;}
+ .time { border-radius: 10px; width: 160px; text-align: center;}
.status { border-radius: 10px; width: 120px; text-align: center;}
.health_ok { background-color: #393; color: white;}
.health_error { background-color: #933; color: white;}
@@ -195,7 +200,8 @@
.right { text-align: right;}
.col_shortmessage { min-width: 300px; }
.col_longmessage { width: auto; }
- .col_properties { width: auto;}
+ .col_properties { width: auto; border-radius: 10px;}
+ .col_bench { width: auto; border-radius: 10px;}
.srv_name { width: 300px }
.srv_path { width: 250px }
@@ -341,8 +347,14 @@
background-color: #282;
}
+ .tooltip { border-bottom: 0px dotted black;}
+ .tooltip .tooltiptext {
+ font-size: 0.9em;
+ width: 200px;
+
+ }
.tooltiptext {
- transform: translate(100px);
+ transform: translate(0px, 2px);
}
.console {
@@ -407,10 +419,13 @@
{% macro bench_page(results, id_label) %}
<div id="{{ id_label }}" class="barcontent">
<h5>{{ caller() }}</h5>
+ <div class="note">Graphs in the detailed section show values measured by the internal Ceph profiler</div>
+ <div class="note">'All agents' value shows theoretical load calculated client-side</div>
<hr>
<table class="ceph_status">
<tr class="node">
- <td class="status">Time started</td>
+ <td class="bench_id">N</td>
+ <td class="time">Time started</td>
<td class="status">Data point</td>
<td class="col_properties">
<div class="props_group">
@@ -441,7 +456,8 @@
{% set o = dt["input_options"] %}
{% set tstripped = time | tstrip %}
<tr class="node" onclick="toggleClassByID('timing_{{ tstripped }}_data')" id="timing_{{ tstripped }}_button">
- <td class="status">{{ time }}</td>
+ <td class="bench_id">{{ dt["id"] }}</td>
+ <td class="time">{{ time }}</td>
<td class="status">All agents</td>
<td class="col_properties">
<div class="props_group">
@@ -469,6 +485,7 @@
</tr>
{% set c = dt["ceph"] %}
<tr class="collapsable" id="timing_{{ tstripped }}_data"><td colspan=3>
+ <div class="details-wrap">
<div class="inlineheader">Global READ stats, MB/s vs seconds. Measured maximum is <b>{{ c["max_rbl"][0] | to_mb }}</b> MB/sec</div>
<div class="bc-wrap">
<div class="bctimecol">
@@ -579,7 +596,7 @@
{% for agent,ag_result in dt["agents"].items() %}
{% set j = ag_result["jobs"][0] %}
<tr class="agents">
- <td class="status">{{ time }}</td>
+ <td class="time">{{ time }}</td>
<td class="status">{{ agent }}</td>
<td class="col_properties">
<div class="props_group">
@@ -608,6 +625,7 @@
</tr>
{% endfor %}
</tbody></table>
+ </div>
</td></tr>
{% endfor %}
</table>
@@ -618,29 +636,68 @@
{% macro osds_page(results, id_label) %}
<div id="{{ id_label }}" class="barcontent">
<h5>{{ caller() }}</h5>
+ <div class="note">Node count is the number of nodes whose parameter changed by the end of the test run compared to its start</div>
+ <div class="note">Hover over column title for description</div>
<hr>
<table class="ceph_status">
<tr class="node">
- <td class="status">Time started</td>
+ <td class="bench_id">N</td>
+ <td class="time">Time started</td>
<td class="status">Data point</td>
<td class="col_properties">
<div class="osd_props_group">
- <div class="item prop">Status</div>
- <div class="item prop">Class</div>
- <div class="item prop">Weight</div>
- <div class="item pg">PGs</div>
+ <div class="tooltip">
+ <div class="item prop">Status</div>
+ <div class="tooltiptext">OSD nodes with 'up' status</div>
+ </div>
+ <div class="tooltip">
+ <div class="item prop">Class</div>
+ <div class="tooltiptext">OSD device class: 'hdd', 'ssd', 'nvme', etc</div>
+ </div>
+ <div class="tooltip">
+ <div class="item prop">Weight</div>
+ <div class="tooltiptext">The weight of the OSD in the CRUSH map</div>
+ </div>
+ <div class="tooltip">
+ <div class="item pg">PGs</div>
+ <div class="tooltiptext">The number of placement groups in the OSD</div>
+ </div>
</div>
</td>
<td class="col_bench">
<div class="osd_stats_group">
- <div class="item bench">Total, GB</div>
- <div class="item bench">Avail., GB</div>
- <div class="item bench">Used, GB</div>
- <div class="item bench">Data, GB</div>
- <div class="item bench">OMAP, GB</div>
- <div class="item bench">Meta, GB</div>
- <div class="item bench">Utilized, %</div>
- <div class="item bench">Variance, %</div>
+ <div class="tooltip">
+ <div class="item bench">Total, GB</div>
+ <div class="tooltiptext">The total storage capacity of the OSD</div>
+ </div>
+ <div class="tooltip">
+ <div class="item bench">Avail., GB</div>
+ <div class="tooltiptext">The amount of free space available on the OSD.</div>
+ </div>
+ <div class="tooltip">
+ <div class="item bench">Used, GB</div>
+ <div class="tooltiptext">The OSD capacity used</div>
+ </div>
+ <div class="tooltip">
+ <div class="item bench">Data, GB</div>
+ <div class="tooltiptext">The amount of OSD capacity that is used by user data</div>
+ </div>
+ <div class="tooltip">
+ <div class="item bench">OMAP, GB</div>
+ <div class="tooltiptext">An estimate value of the bluefs storage that is being used to store object map (omap) data (key value pairs stored in rocksdb)</div>
+ </div>
+ <div class="tooltip">
+ <div class="item bench">Meta, GB</div>
+ <div class="tooltiptext">The bluefs space allocated, or the value set in the bluestore_bluefs_min parameter, whichever is larger, for internal metadata which is calculated as the total space allocated in bluefs minus the estimated omap data size</div>
+ </div>
+ <div class="tooltip">
+ <div class="item bench">Utilized, %</div>
+ <div class="tooltiptext">The notional percentage of storage used by the OSD</div>
+ </div>
+ <div class="tooltip">
+ <div class="item bench">Variance, %</div>
+ <div class="tooltiptext">The variation above or below average utilization</div>
+ </div>
</div>
</td>
</tr>
@@ -650,7 +707,8 @@
{% set s = dt["osd_summary"]["active"] %}
{% set tstripped = time | tstrip %}
<tr class="node" onclick="toggleClassByID('timing_{{ tstripped }}_osds')" id="timing_{{ tstripped }}_button">
- <td class="status">{{ time }}</td>
+ <td class="bench_id">{{ dt["id"] }}</td>
+ <td class="time">{{ time }}</td>
<td class="status">Active nodes</td>
<td class="col_properties">
<div class="osd_props_group">
@@ -681,7 +739,7 @@
{% set a = n["after"] %}
{% set p = n["percent"] %}
<tr class="agents">
- <td class="status">{{ time }}</td>
+ <td class="time">{{ time }}</td>
<td class="status">{{ osd }}</td>
<td class="col_properties">
<div class="osd_props_group">