import os
import abc
import logging
from io import BytesIO
from functools import wraps
from typing import Dict, Any, Iterator, Tuple, cast, List, Callable, Set, Optional
from collections import defaultdict
import numpy
import scipy.stats
import matplotlib.pyplot as plt
import wally
from . import html
from .stage import Stage, StepOrder
from .test_run_class import TestRun
from .hlstorage import ResultStorage
from .node_interfaces import NodeInfo
from .utils import b2ssize, b2ssize_10, STORAGE_ROLES
from .statistic import (calc_norm_stat_props, calc_histo_stat_props, moving_average, moving_dev,
hist_outliers_perc, ts_hist_outliers_perc, find_ouliers_ts, approximate_curve)
from .result_classes import (StatProps, DataSource, TimeSeries, NormStatProps, HistoStatProps, SuiteConfig,
IResultStorage)
from .suits.io.fio_hist import get_lat_vals, expected_lat_bins
from .suits.io.fio import FioTest, FioJobConfig
from .suits.io.fio_job import FioJobParams
from .suits.job import JobConfig
logger = logging.getLogger("wally")
# ---------------- CONSTS ---------------------------------------------------------------------------------------------
DEBUG = False
LARGE_BLOCKS = 256
MiB2KiB = 1024
MS2S = 1000
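# Unit conventions assumed below: fio bandwidth series are stored in KiBps, so
# dividing by MiB2KiB yields MiBps and dividing by the block size (in KiB)
# yields IOPS; latency histogram bin edges come in us and are divided by 1000
# to get ms, while MS2S converts between ms and s timestamps.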
# ---------------- PROFILES ------------------------------------------------------------------------------------------
# these are default values; the real values are loaded from the config
class ColorProfile:
primary_color = 'b'
suppl_color1 = 'teal'
suppl_color2 = 'magenta'
suppl_color3 = 'orange'
box_color = 'y'
err_color = 'red'
noise_alpha = 0.3
subinfo_alpha = 0.7
imshow_colormap = None # type: str
class StyleProfile:
grid = True
tide_layout = True
hist_boxes = 10
hist_lat_boxes = 25
hm_hist_bins_count = 25
min_points_for_dev = 5
dev_range_x = 2.0
dev_perc = 95
point_shape = 'o'
err_point_shape = '*'
avg_range = 20
approx_average = True
curve_approx_level = 6
curve_approx_points = 100
assert avg_range >= min_points_for_dev
# figure size in inches
figsize = (10, 6)
extra_io_spine = True
legend_for_eng = True
heatmap_interpolation = '1d'
heatmap_interpolation_points = 300
outliers_q_nd = 3.0
outliers_hide_q_nd = 4.0
outliers_lat = (0.01, 0.995)
violin_instead_of_box = True
violin_point_count = 30000
heatmap_colorbar = False
min_iops_vs_qd_jobs = 3
units = {
        'bw': ("MiBps", MiB2KiB, "bandwidth"),
'iops': ("IOPS", 1, "iops"),
'lat': ("ms", 1, "latency")
}
# ---------------- STRUCTS -------------------------------------------------------------------------------------------
# TODO: needs to be revised; use StatProps fields instead
class StoragePerfSummary:
def __init__(self, name: str) -> None:
self.direct_iops_r_max = 0 # type: int
self.direct_iops_w_max = 0 # type: int
        # 64k is used instead of 4k to fill caches faster
self.direct_iops_w64_max = 0 # type: int
self.rws4k_10ms = 0 # type: int
self.rws4k_30ms = 0 # type: int
self.rws4k_100ms = 0 # type: int
self.bw_write_max = 0 # type: int
self.bw_read_max = 0 # type: int
self.bw = None # type: float
self.iops = None # type: float
self.lat = None # type: float
self.lat_50 = None # type: float
self.lat_95 = None # type: float
class IOSummary:
def __init__(self,
qd: int,
block_size: int,
                 nodes_count: int,
bw: NormStatProps,
lat: HistoStatProps) -> None:
self.qd = qd
self.nodes_count = nodes_count
self.block_size = block_size
self.bw = bw
self.lat = lat
# -------------- AGGREGATION AND STAT FUNCTIONS ----------------------------------------------------------------------
def make_iosum(rstorage: ResultStorage, suite: SuiteConfig, job: FioJobConfig) -> IOSummary:
lat = get_aggregated(rstorage, suite, job, "lat")
bins_edges = numpy.array(get_lat_vals(lat.data.shape[1]), dtype='float32') / 1000
io = get_aggregated(rstorage, suite, job, "bw")
return IOSummary(job.qd,
nodes_count=len(suite.nodes_ids),
block_size=job.bsize,
lat=calc_histo_stat_props(lat, bins_edges, rebins_count=StyleProfile.hist_boxes),
bw=calc_norm_stat_props(io, StyleProfile.hist_boxes))
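# make_iosum collapses one fio job into a single summary point: the per-node
# 'bw' and 'lat' series are first aggregated cluster-wide (see get_aggregated
# below), then reduced to distribution statistics; IO_QD uses one such point
# per queue-depth value when building the QD charts.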
#
# def iter_io_results(rstorage: ResultStorage,
# qds: List[int] = None,
# op_types: List[str] = None,
# sync_types: List[str] = None,
# block_sizes: List[int] = None) -> Iterator[Tuple[TestSuiteConfig, FioJobConfig]]:
#
# for suite in rstorage.iter_suite(FioTest.name):
# for job in rstorage.iter_job(suite):
# fjob = cast(FioJobConfig, job)
# assert int(fjob.vals['numjobs']) == 1
#
#         if sync_types is not None and fjob.sync_mode not in sync_types:
# continue
#
# if block_sizes is not None and fjob.bsize not in block_sizes:
# continue
#
# if op_types is not None and fjob.op_type not in op_types:
# continue
#
# if qds is not None and fjob.qd not in qds:
# continue
#
# yield suite, fjob
AGG_TAG = 'ALL'
def get_aggregated(rstorage: ResultStorage, suite: SuiteConfig, job: FioJobConfig, metric: str) -> TimeSeries:
tss = list(rstorage.iter_ts(suite, job, metric=metric))
ds = DataSource(suite_id=suite.storage_id,
job_id=job.storage_id,
node_id=AGG_TAG,
sensor='fio',
dev=AGG_TAG,
metric=metric,
tag='csv')
agg_ts = TimeSeries(metric,
raw=None,
source=ds,
data=numpy.zeros(tss[0].data.shape, dtype=tss[0].data.dtype),
times=tss[0].times.copy(),
units=tss[0].units)
for ts in tss:
if metric == 'lat' and (len(ts.data.shape) != 2 or ts.data.shape[1] != expected_lat_bins):
logger.error("Sensor %s.%s on node %s has" +
"shape=%s. Can only process sensors with shape=[X, %s].",
ts.source.dev, ts.source.sensor, ts.source.node_id,
ts.data.shape, expected_lat_bins)
raise ValueError()
if metric != 'lat' and len(ts.data.shape) != 1:
logger.error("Sensor %s.%s on node %s has " +
"shape=%s. Can only process 1D sensors.",
ts.source.dev, ts.source.sensor, ts.source.node_id, ts.data.shape)
raise ValueError()
# TODO: match times on different ts
agg_ts.data += ts.data
return agg_ts
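# Note: aggregation is a plain element-wise sum, so for 'bw' the result is the
# cluster-wide bandwidth per time slot and for 'lat' the per-bin histogram
# counts summed across nodes; times are taken from the first series (see the
# TODO above about matching times between series).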
def is_sensor_numarray(sensor: str, metric: str) -> bool:
"""Returns True if sensor provides one-dimension array of numeric values. One number per one measurement."""
return True
LEVEL_SENSORS = {("block-io", "io_queue"),
("system-cpu", "procs_blocked"),
("system-cpu", "procs_queue")}
def is_level_sensor(sensor: str, metric: str) -> bool:
"""Returns True if sensor measure level of any kind, E.g. queue depth."""
return (sensor, metric) in LEVEL_SENSORS
def is_delta_sensor(sensor: str, metric: str) -> bool:
"""Returns True if sensor provides deltas for cumulative value. E.g. io completed in given period"""
return not is_level_sensor(sensor, metric)
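# e.g. ("block-io", "io_queue") is a level sensor (a snapshot of queue depth),
# while ("block-io", "writes_completed") is a delta sensor - each sample counts
# the operations completed since the previous poll.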
def get_sensor_for_time_range(storage: IResultStorage,
node_id: str,
sensor: str,
dev: str,
metric: str,
time_range: Tuple[int, int]) -> numpy.array:
"""Return sensor values for given node for given period. Return per second estimated values array
Raise an error if required range is not full covered by data in storage.
First it finds range of results from sensor, which fully covers requested range.
...."""
ds = DataSource(node_id=node_id, sensor=sensor, dev=dev, metric=metric)
sensor_data = storage.load_sensor(ds)
assert sensor_data.time_units == 'us'
# collected_at is array of pairs (collection_started_at, collection_finished_at)
# extract start time from each pair
collection_start_at = sensor_data.times[::2] # type: numpy.array
MICRO = 1000000
# convert seconds to us
begin = time_range[0] * MICRO
end = time_range[1] * MICRO
if begin < collection_start_at[0] or end > collection_start_at[-1] or end <= begin:
raise AssertionError(("Incorrect data for get_sensor - time_range={!r}, collected_at=[{}, ..., {}]," +
"sensor = {}_{}.{}.{}").format(time_range,
sensor_data.times[0] // MICRO,
sensor_data.times[-1] // MICRO,
node_id, sensor, dev, metric))
pos1, pos2 = numpy.searchsorted(collection_start_at, (begin, end))
# current real data time chunk begin time
edge_it = iter(collection_start_at[pos1 - 1: pos2 + 1])
# current real data value
val_it = iter(sensor_data.data[pos1 - 1: pos2 + 1])
# result array, cumulative value per second
result = numpy.zeros(int(end - begin) // MICRO)
idx = 0
curr_summ = 0
# end of current time slot
results_cell_ends = begin + MICRO
# hack to unify looping
real_data_end = next(edge_it)
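    # The loop below spreads each collected delta uniformly over its collection
    # interval (at rate value / interval_length) and accumulates the overlapping
    # portions into consecutive one-second result cells, so the chunk totals are
    # preserved when a collection interval straddles a cell boundary.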
while results_cell_ends <= end:
real_data_start = real_data_end
real_data_end = next(edge_it)
real_val_left = next(val_it)
# real data "speed" for interval [real_data_start, real_data_end]
real_val_ps = float(real_val_left) / (real_data_end - real_data_start)
while real_data_end >= results_cell_ends and results_cell_ends <= end:
            # part of the current real value which fits into the current result cell
curr_real_chunk = int((results_cell_ends - real_data_start) * real_val_ps)
# calculate rest of real data for next result cell
real_val_left -= curr_real_chunk
result[idx] = curr_summ + curr_real_chunk
idx += 1
curr_summ = 0
# adjust real data start time
real_data_start = results_cell_ends
results_cell_ends += MICRO
        # don't lose any real data
curr_summ += real_val_left
return result
# -------------- PLOT HELPERS FUNCTIONS ------------------------------------------------------------------------------
def get_emb_data_svg(plt: Any, format: str = 'svg') -> bytes:
bio = BytesIO()
if format in ('png', 'jpg'):
plt.savefig(bio, format=format)
return bio.getvalue()
elif format == 'svg':
plt.savefig(bio, format='svg')
img_start = "<!-- Created with matplotlib (http://matplotlib.org/) -->"
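        # everything up to (and including) this marker comment - i.e. the
        # XML/DOCTYPE prologue - is stripped below, so the remaining bare
        # <svg> element can be inlined directly into the HTML report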
return bio.getvalue().decode("utf8").split(img_start, 1)[1].encode("utf8")
def provide_plot(func: Callable[..., None]) -> Callable[..., str]:
@wraps(func)
def closure1(storage: ResultStorage,
path: DataSource,
*args, **kwargs) -> str:
fpath = storage.check_plot_file(path)
if not fpath:
format = path.tag.split(".")[-1]
plt.figure(figsize=StyleProfile.figsize)
plt.subplots_adjust(right=0.66)
func(*args, **kwargs)
fpath = storage.put_plot_file(get_emb_data_svg(plt, format=format), path)
logger.debug("Plot %s saved to %r", path, fpath)
plt.clf()
plt.close('all')
return fpath
return closure1
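# Plot functions below are wrapped by provide_plot, so they are invoked as
# func(storage, path, *args, **kwargs): the decorator checks for a cached plot
# file for the given DataSource first and only renders (then stores) on a miss,
# e.g. plot_hist(rstorage, source, title, units, props) returns the file path.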
def apply_style(style: StyleProfile, eng: bool = True, no_legend: bool = False) -> None:
if style.grid:
plt.grid(True)
if (style.legend_for_eng or not eng) and not no_legend:
legend_location = "center left"
legend_bbox_to_anchor = (1.03, 0.81)
plt.legend(loc=legend_location, bbox_to_anchor=legend_bbox_to_anchor)
# -------------- PLOT FUNCTIONS --------------------------------------------------------------------------------------
@provide_plot
def plot_hist(title: str, units: str,
prop: StatProps,
colors: Any = ColorProfile,
style: Any = StyleProfile) -> None:
    # TODO: units should come from the ts
normed_bins = prop.bins_populations / prop.bins_populations.sum()
bar_width = prop.bins_edges[1] - prop.bins_edges[0]
plt.bar(prop.bins_edges, normed_bins, color=colors.box_color, width=bar_width, label="Real data")
plt.xlabel(units)
plt.ylabel("Value probability")
plt.title(title)
dist_plotted = False
if isinstance(prop, NormStatProps):
nprop = cast(NormStatProps, prop)
stats = scipy.stats.norm(nprop.average, nprop.deviation)
new_edges, step = numpy.linspace(prop.bins_edges[0], prop.bins_edges[-1],
len(prop.bins_edges) * 10, retstep=True)
ypoints = stats.cdf(new_edges) * 11
        ypoints = [cur - prev for (cur, prev) in zip(ypoints[1:], ypoints[:-1])]
xpoints = (new_edges[1:] + new_edges[:-1]) / 2
plt.plot(xpoints, ypoints, color=colors.primary_color, label="Expected from\nnormal\ndistribution")
dist_plotted = True
plt.gca().set_xlim(left=prop.bins_edges[0])
if prop.log_bins:
plt.xscale('log')
apply_style(style, eng=True, no_legend=not dist_plotted)
@provide_plot
def plot_v_over_time(title: str, units: str,
ts: TimeSeries,
plot_avg_dev: bool = True,
colors: Any = ColorProfile, style: Any = StyleProfile) -> None:
min_time = min(ts.times)
# /1000 is us to ms conversion
time_points = numpy.array([(val_time - min_time) / 1000 for val_time in ts.times])
outliers_idxs = find_ouliers_ts(ts.data, cut_range=style.outliers_q_nd)
outliers_4q_idxs = find_ouliers_ts(ts.data, cut_range=style.outliers_hide_q_nd)
normal_idxs = numpy.logical_not(outliers_idxs)
outliers_idxs = outliers_idxs & numpy.logical_not(outliers_4q_idxs)
hidden_outliers_count = numpy.count_nonzero(outliers_4q_idxs)
data = ts.data[normal_idxs]
data_times = time_points[normal_idxs]
outliers = ts.data[outliers_idxs]
outliers_times = time_points[outliers_idxs]
alpha = colors.noise_alpha if plot_avg_dev else 1.0
plt.plot(data_times, data, style.point_shape,
color=colors.primary_color, alpha=alpha, label="Data")
plt.plot(outliers_times, outliers, style.err_point_shape,
color=colors.err_color, label="Outliers")
has_negative_dev = False
plus_minus = "\xb1"
if plot_avg_dev and len(data) < style.avg_range * 2:
logger.warning("Array %r to small to plot average over %s points", title, style.avg_range)
elif plot_avg_dev:
avg_vals = moving_average(data, style.avg_range)
dev_vals = moving_dev(data, style.avg_range)
avg_times = moving_average(data_times, style.avg_range)
if style.approx_average:
avg_vals = approximate_curve(avg_times, avg_vals, avg_times, style.curve_approx_level)
dev_vals = approximate_curve(avg_times, dev_vals, avg_times, style.curve_approx_level)
plt.plot(avg_times, avg_vals, c=colors.suppl_color1, label="Average")
low_vals_dev = avg_vals - dev_vals * style.dev_range_x
        high_vals_dev = avg_vals + dev_vals * style.dev_range_x
if style.dev_range_x - int(style.dev_range_x) < 0.01:
plt.plot(avg_times, low_vals_dev, c=colors.suppl_color2,
label="{}{}*stdev".format(plus_minus, int(style.dev_range_x)))
else:
plt.plot(avg_times, low_vals_dev, c=colors.suppl_color2,
label="{}{}*stdev".format(plus_minus, style.dev_range_x))
        plt.plot(avg_times, high_vals_dev, c=colors.suppl_color2)
has_negative_dev = low_vals_dev.min() < 0
plt.xlim(-5, max(time_points) + 5)
plt.xlabel("Time, seconds from test begin")
plt.ylabel("{}. Average and {}stddev over {} points".format(units, plus_minus, style.avg_range))
plt.title(title)
if has_negative_dev:
plt.gca().set_ylim(bottom=0)
apply_style(style, eng=True)
@provide_plot
def plot_lat_over_time(title: str, ts: TimeSeries, bins_vals: List[int], samples: int = 5,
colors: Any = ColorProfile,
style: Any = StyleProfile) -> None:
min_time = min(ts.times)
times = [int(tm - min_time + 500) // 1000 for tm in ts.times]
ts_len = len(times)
step = ts_len / samples
points = [times[int(i * step + 0.5)] for i in range(samples)]
points.append(times[-1])
bounds = list(zip(points[:-1], points[1:]))
agg_data = []
positions = []
labels = []
for begin, end in bounds:
agg_hist = ts.data[begin:end].sum(axis=0)
if style.violin_instead_of_box:
# cut outliers
idx1, idx2 = hist_outliers_perc(agg_hist, style.outliers_lat)
agg_hist = agg_hist[idx1:idx2]
curr_bins_vals = bins_vals[idx1:idx2]
correct_coef = style.violin_point_count / sum(agg_hist)
if correct_coef > 1:
correct_coef = 1
else:
curr_bins_vals = bins_vals
correct_coef = 1
vals = numpy.empty(shape=(numpy.sum(agg_hist),), dtype='float32')
cidx = 0
non_zero, = agg_hist.nonzero()
for pos in non_zero:
count = int(agg_hist[pos] * correct_coef + 0.5)
if count != 0:
vals[cidx: cidx + count] = curr_bins_vals[pos]
cidx += count
agg_data.append(vals[:cidx])
positions.append((end + begin) / 2)
labels.append(str((end + begin) // 2))
if style.violin_instead_of_box:
patches = plt.violinplot(agg_data,
positions=positions,
showmeans=True,
showmedians=True,
widths=step / 2)
patches['cmeans'].set_color("blue")
patches['cmedians'].set_color("green")
if style.legend_for_eng:
legend_location = "center left"
legend_bbox_to_anchor = (1.03, 0.81)
plt.legend([patches['cmeans'], patches['cmedians']], ["mean", "median"],
loc=legend_location, bbox_to_anchor=legend_bbox_to_anchor)
else:
plt.boxplot(agg_data, 0, '', positions=positions, labels=labels, widths=step / 4)
plt.xlim(min(times), max(times))
plt.xlabel("Time, seconds from test begin, sampled for ~{} seconds".format(int(step)))
plt.ylabel("Latency, ms")
plt.title(title)
apply_style(style, eng=True, no_legend=True)
@provide_plot
def plot_heatmap(title: str,
ts: TimeSeries,
bins_vals: List[int],
colors: Any = ColorProfile,
style: Any = StyleProfile) -> None:
assert len(ts.data.shape) == 2
assert ts.data.shape[1] == len(bins_vals)
total_hist = ts.data.sum(axis=0)
# idx1, idx2 = hist_outliers_perc(total_hist, style.outliers_lat)
idx1, idx2 = ts_hist_outliers_perc(ts.data, bounds_perc=style.outliers_lat)
# don't cut too many bins
min_bins_left = style.hm_hist_bins_count
if idx2 - idx1 < min_bins_left:
        missed = (min_bins_left - (idx2 - idx1)) // 2
idx2 = min(len(total_hist), idx2 + missed)
idx1 = max(0, idx1 - missed)
data = ts.data[:, idx1:idx2]
bins_vals = bins_vals[idx1:idx2]
    # don't use rebin_histogram here, as we need to apply the same bins to many arrays
step = (bins_vals[-1] - bins_vals[0]) / style.hm_hist_bins_count
new_bins_edges = numpy.arange(style.hm_hist_bins_count) * step + bins_vals[0]
bin_mapping = numpy.clip(numpy.searchsorted(new_bins_edges, bins_vals) - 1, 0, len(new_bins_edges) - 1)
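    # e.g. with new_bins_edges = [0, 10, 20], a source bin centred at 15 maps to
    # heatmap bin 1 (searchsorted gives 2, minus 1), while a value of 0 would
    # give -1 and is clipped back to bin 0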
# map origin bins ranges to heatmap bins, iterate over rows
cmap = []
for line in data:
curr_bins = [0] * style.hm_hist_bins_count
for idx, count in zip(bin_mapping, line):
curr_bins[idx] += count
cmap.append(curr_bins)
ncmap = numpy.array(cmap)
xmin = 0
xmax = (ts.times[-1] - ts.times[0]) / 1000 + 1
ymin = new_bins_edges[0]
ymax = new_bins_edges[-1]
fig, ax = plt.subplots(figsize=style.figsize)
if style.heatmap_interpolation == '1d':
interpolation = 'none'
res = []
for column in ncmap:
new_x = numpy.linspace(0, len(column), style.heatmap_interpolation_points)
old_x = numpy.arange(len(column)) + 0.5
new_vals = numpy.interp(new_x, old_x, column)
res.append(new_vals)
ncmap = numpy.array(res)
else:
interpolation = style.heatmap_interpolation
ax.imshow(ncmap[:,::-1].T,
interpolation=interpolation,
extent=(xmin, xmax, ymin, ymax),
cmap=colors.imshow_colormap)
ax.set_aspect((xmax - xmin) / (ymax - ymin) * (6 / 9))
ax.set_ylabel("Latency, ms")
ax.set_xlabel("Test time, s")
plt.title(title)
@provide_plot
def io_chart(title: str,
legend: str,
iosums: List[IOSummary],
iops_log_spine: bool = False,
lat_log_spine: bool = False,
colors: Any = ColorProfile,
style: Any = StyleProfile) -> None:
# -------------- MAGIC VALUES ---------------------
# IOPS bar width
width = 0.35
# offset from center of bar to deviation/confidence range indicator
err_x_offset = 0.05
# extra space on top and bottom, comparing to maximal tight layout
extra_y_space = 0.05
# additional spine for BW/IOPS on left side of plot
extra_io_spine_x_offset = -0.1
# extra space on left and right sides
extra_x_space = 0.5
# legend location settings
legend_location = "center left"
legend_bbox_to_anchor = (1.1, 0.81)
# plot box size adjust (only plot, not spines and legend)
plot_box_adjust = {'right': 0.66}
# -------------- END OF MAGIC VALUES ---------------------
block_size = iosums[0].block_size
lc = len(iosums)
xt = list(range(1, lc + 1))
# x coordinate of middle of the bars
xpos = [i - width / 2 for i in xt]
# import matplotlib.gridspec as gridspec
# gs = gridspec.GridSpec(1, 3, width_ratios=[1, 4, 1])
# p1 = plt.subplot(gs[1])
fig, p1 = plt.subplots(figsize=StyleProfile.figsize)
# plot IOPS/BW bars
if block_size >= LARGE_BLOCKS:
iops_primary = False
coef = MiB2KiB
p1.set_ylabel("BW (MiBps)")
else:
iops_primary = True
coef = block_size
p1.set_ylabel("IOPS")
p1.bar(xpos, [iosum.bw.average / coef for iosum in iosums], width=width, color=colors.box_color, label=legend)
# set correct x limits for primary IO spine
min_io = min(iosum.bw.average - iosum.bw.deviation * style.dev_range_x for iosum in iosums)
max_io = max(iosum.bw.average + iosum.bw.deviation * style.dev_range_x for iosum in iosums)
border = (max_io - min_io) * extra_y_space
io_lims = (min_io - border, max_io + border)
p1.set_ylim(io_lims[0] / coef, io_lims[-1] / coef)
# plot deviation and confidence error ranges
err1_legend = err2_legend = None
for pos, iosum in zip(xpos, iosums):
err1_legend = p1.errorbar(pos + width / 2 - err_x_offset,
iosum.bw.average / coef,
iosum.bw.deviation * style.dev_range_x / coef,
alpha=colors.subinfo_alpha,
color=colors.suppl_color1) # 'magenta'
err2_legend = p1.errorbar(pos + width / 2 + err_x_offset,
iosum.bw.average / coef,
iosum.bw.confidence / coef,
alpha=colors.subinfo_alpha,
color=colors.suppl_color2) # 'teal'
if style.grid:
p1.grid(True)
handles1, labels1 = p1.get_legend_handles_labels()
handles1 += [err1_legend, err2_legend]
labels1 += ["{}% dev".format(style.dev_perc),
"{}% conf".format(int(100 * iosums[0].bw.confidence_level))]
# extra y spine for latency on right side
p2 = p1.twinx()
# plot median and 95 perc latency
p2.plot(xt, [iosum.lat.perc_50 for iosum in iosums], label="lat med")
p2.plot(xt, [iosum.lat.perc_95 for iosum in iosums], label="lat 95%")
# limit and label x spine
plt.xlim(extra_x_space, lc + extra_x_space)
plt.xticks(xt, ["{0} * {1}".format(iosum.qd, iosum.nodes_count) for iosum in iosums])
p1.set_xlabel("QD * Test node count")
    # apply log scales for Y spines, if requested
if iops_log_spine:
p1.set_yscale('log')
if lat_log_spine:
p2.set_yscale('log')
# extra y spine for BW/IOPS on left side
if style.extra_io_spine:
p3 = p1.twinx()
if iops_log_spine:
p3.set_yscale('log')
if iops_primary:
p3.set_ylabel("BW (MiBps)")
p3.set_ylim(io_lims[0] / MiB2KiB, io_lims[1] / MiB2KiB)
else:
p3.set_ylabel("IOPS")
p3.set_ylim(io_lims[0] / block_size, io_lims[1] / block_size)
p3.spines["left"].set_position(("axes", extra_io_spine_x_offset))
p3.spines["left"].set_visible(True)
p3.yaxis.set_label_position('left')
p3.yaxis.set_ticks_position('left')
p2.set_ylabel("Latency (ms)")
plt.title(title)
# legend box
handles2, labels2 = p2.get_legend_handles_labels()
plt.legend(handles1 + handles2, labels1 + labels2,
loc=legend_location,
bbox_to_anchor=legend_bbox_to_anchor)
# adjust central box size to fit legend
plt.subplots_adjust(**plot_box_adjust)
apply_style(style, eng=False, no_legend=True)
# -------------------- REPORT HELPERS --------------------------------------------------------------------------------
class HTMLBlock:
data = None # type: str
js_links = [] # type: List[str]
css_links = [] # type: List[str]
order_attr = None # type: Any
def __init__(self, data: str, order_attr: Any = None) -> None:
self.data = data
self.order_attr = order_attr
def __eq__(self, o: object) -> bool:
return o.order_attr == self.order_attr # type: ignore
def __lt__(self, o: object) -> bool:
return o.order_attr > self.order_attr # type: ignore
class Table:
def __init__(self, header: List[str]) -> None:
self.header = header
self.data = []
def add_line(self, values: List[str]) -> None:
self.data.append(values)
def html(self):
return html.table("", self.header, self.data)
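# Illustrative usage: tbl = Table(["Name", "Value"]); tbl.add_line(["qd", "16"]);
# tbl.html() then renders the header plus the accumulated rows via the html module.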
class Menu1st:
engineering = "Engineering"
summary = "Summary"
per_job = "Per Job"
class Menu2ndEng:
iops_time = "IOPS(time)"
hist = "IOPS/lat overall histogram"
lat_time = "Lat(time)"
class Menu2ndSumm:
io_lat_qd = "IO & Lat vs QD"
menu_1st_order = [Menu1st.summary, Menu1st.engineering, Menu1st.per_job]
# -------------------- REPORTS --------------------------------------------------------------------------------------
class Reporter(metaclass=abc.ABCMeta):
suite_types = set() # type: Set[str]
@abc.abstractmethod
def get_divs(self, suite: SuiteConfig, storage: ResultStorage) -> Iterator[Tuple[str, str, HTMLBlock]]:
pass
class JobReporter(metaclass=abc.ABCMeta):
    suite_types = set()  # type: Set[str]
@abc.abstractmethod
def get_divs(self,
suite: SuiteConfig,
job: JobConfig,
storage: ResultStorage) -> Iterator[Tuple[str, str, HTMLBlock]]:
pass
# Main performance report
class PerformanceSummary(Reporter):
"""Aggregated summary fro storage"""
# Main performance report
class IO_QD(Reporter):
"""Creates graph, which show how IOPS and Latency depend on QD"""
suite_types = {'fio'}
def get_divs(self, suite: SuiteConfig, rstorage: ResultStorage) -> Iterator[Tuple[str, str, HTMLBlock]]:
ts_map = defaultdict(list) # type: Dict[FioJobParams, List[Tuple[SuiteConfig, FioJobConfig]]]
        str_summary = {}  # type: Dict[FioJobParams, Tuple[str, str]]
for job in rstorage.iter_job(suite):
fjob = cast(FioJobConfig, job)
fjob_no_qd = cast(FioJobParams, fjob.params.copy(qd=None))
str_summary[fjob_no_qd] = (fjob_no_qd.summary, fjob_no_qd.long_summary)
ts_map[fjob_no_qd].append((suite, fjob))
for tpl, suites_jobs in ts_map.items():
if len(suites_jobs) > StyleProfile.min_iops_vs_qd_jobs:
iosums = [make_iosum(rstorage, suite, job) for suite, job in suites_jobs]
iosums.sort(key=lambda x: x.qd)
summary, summary_long = str_summary[tpl]
ds = DataSource(suite_id=suite.storage_id,
job_id=summary,
node_id=AGG_TAG,
sensor="fio",
dev=AGG_TAG,
metric="io_over_qd",
tag="svg")
title = "IOPS, BW, Lat vs. QD.\n" + summary_long
fpath = io_chart(rstorage, ds, title=title, legend="IOPS/BW", iosums=iosums) # type: str
yield Menu1st.summary, Menu2ndSumm.io_lat_qd, HTMLBlock(html.img(fpath))
# Linearization report
class IOPS_Bsize(Reporter):
"""Creates graphs, which show how IOPS and Latency depend on block size"""
def summ_sensors(rstorage: ResultStorage,
nodes: List[str],
sensor: str,
metric: str,
time_range: Tuple[int, int]) -> Optional[numpy.array]:
res = None # type: Optional[numpy.array]
for node_id in nodes:
for _, groups in rstorage.iter_sensors(node_id=node_id, sensor=sensor, metric=metric):
data = get_sensor_for_time_range(rstorage,
node_id=node_id,
sensor=sensor,
dev=groups['dev'],
metric=metric,
time_range=time_range)
if res is None:
res = data
else:
res += data
return res
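# summ_sensors adds up the per-second estimates element-wise over every matching
# device on all the listed nodes; it returns None when no sensor data matched,
# which callers (e.g. StatInfo below) must check before using the result.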
# IOPS/latency distribution
class StatInfo(JobReporter):
"""Statistic info for job results"""
suite_types = {'fio'}
def get_divs(self, suite: SuiteConfig, job: JobConfig,
rstorage: ResultStorage) -> Iterator[Tuple[str, str, HTMLBlock]]:
fjob = cast(FioJobConfig, job)
io_sum = make_iosum(rstorage, suite, fjob)
summary_data = [
["Summary", job.params.long_summary],
]
res = html.H2(html.center("Test summary"))
res += html.table("Test info", None, summary_data)
        stat_data_headers = ["Name", "Average ~ Dev", "Conf interval", "Median", "Mode", "Kurt / Skew", "95%", "99%"]
KB = 1024
bw_data = ["Bandwidth",
"{}Bps ~ {}Bps".format(b2ssize(io_sum.bw.average * KB), b2ssize(io_sum.bw.deviation * KB)),
b2ssize(io_sum.bw.confidence * KB) + "Bps",
b2ssize(io_sum.bw.perc_50 * KB) + "Bps",
"-",
"{:.2f} / {:.2f}".format(io_sum.bw.kurt, io_sum.bw.skew),
b2ssize(io_sum.bw.perc_5 * KB) + "Bps",
b2ssize(io_sum.bw.perc_1 * KB) + "Bps"]
iops_data = ["IOPS",
"{}IOPS ~ {}IOPS".format(b2ssize_10(io_sum.bw.average / fjob.bsize),
b2ssize_10(io_sum.bw.deviation / fjob.bsize)),
b2ssize_10(io_sum.bw.confidence / fjob.bsize) + "IOPS",
b2ssize_10(io_sum.bw.perc_50 / fjob.bsize) + "IOPS",
"-",
"{:.2f} / {:.2f}".format(io_sum.bw.kurt, io_sum.bw.skew),
b2ssize_10(io_sum.bw.perc_5 / fjob.bsize) + "IOPS",
b2ssize_10(io_sum.bw.perc_1 / fjob.bsize) + "IOPS"]
MICRO = 1000000
# latency
lat_data = ["Latency",
"-",
"-",
b2ssize_10(io_sum.bw.perc_50 / MICRO) + "s",
"-",
"-",
b2ssize_10(io_sum.bw.perc_95 / MICRO) + "s",
b2ssize_10(io_sum.bw.perc_99 / MICRO) + "s"]
# sensor usage
stat_data = [iops_data, bw_data, lat_data]
res += html.table("Load stats info", stat_data_headers, stat_data)
resource_headers = ["Resource", "Usage count", "Proportional to work done"]
io_transfered = io_sum.bw.data.sum() * KB
resource_data = [
["IO made", b2ssize_10(io_transfered / KB / fjob.bsize) + "OP", "-"],
["Data transfered", b2ssize(io_transfered) + "B", "-"]
]
storage = rstorage.storage
nodes = storage.load_list(NodeInfo, 'all_nodes') # type: List[NodeInfo]
storage_nodes = [node.node_id for node in nodes if node.roles.intersection(STORAGE_ROLES)]
test_nodes = [node.node_id for node in nodes if "testnode" in node.roles]
trange = (job.reliable_info_range[0] / 1000, job.reliable_info_range[1] / 1000)
ops_done = io_transfered / fjob.bsize / KB
all_metrics = [
("Test nodes net send", 'net-io', 'send_bytes', b2ssize, test_nodes, "B", io_transfered),
("Test nodes net recv", 'net-io', 'recv_bytes', b2ssize, test_nodes, "B", io_transfered),
("Test nodes disk write", 'block-io', 'sectors_written', b2ssize, test_nodes, "B", io_transfered),
("Test nodes disk read", 'block-io', 'sectors_read', b2ssize, test_nodes, "B", io_transfered),
("Test nodes writes", 'block-io', 'writes_completed', b2ssize_10, test_nodes, "OP", ops_done),
("Test nodes reads", 'block-io', 'reads_completed', b2ssize_10, test_nodes, "OP", ops_done),
("Storage nodes net send", 'net-io', 'send_bytes', b2ssize, storage_nodes, "B", io_transfered),
("Storage nodes net recv", 'net-io', 'recv_bytes', b2ssize, storage_nodes, "B", io_transfered),
("Storage nodes disk write", 'block-io', 'sectors_written', b2ssize, storage_nodes, "B", io_transfered),
("Storage nodes disk read", 'block-io', 'sectors_read', b2ssize, storage_nodes, "B", io_transfered),
("Storage nodes writes", 'block-io', 'writes_completed', b2ssize_10, storage_nodes, "OP", ops_done),
("Storage nodes reads", 'block-io', 'reads_completed', b2ssize_10, storage_nodes, "OP", ops_done),
]
all_agg = {}
for descr, sensor, metric, ffunc, nodes, units, denom in all_metrics:
if not nodes:
continue
res_arr = summ_sensors(rstorage, nodes=nodes, sensor=sensor, metric=metric, time_range=trange)
if res_arr is None:
continue
agg = res_arr.sum()
resource_data.append([descr, ffunc(agg) + units, "{:.1f}".format(agg / denom)])
all_agg[descr] = agg
cums = [
("Test nodes writes", "Test nodes reads", "Total test ops", b2ssize_10, "OP", ops_done),
("Storage nodes writes", "Storage nodes reads", "Total storage ops", b2ssize_10, "OP", ops_done),
("Storage nodes disk write", "Storage nodes disk read", "Total storage IO size", b2ssize,
"B", io_transfered),
("Test nodes disk write", "Test nodes disk read", "Total test nodes IO size", b2ssize, "B", io_transfered),
]
for name1, name2, descr, ffunc, units, denom in cums:
if name1 in all_agg and name2 in all_agg:
agg = all_agg[name1] + all_agg[name2]
resource_data.append([descr, ffunc(agg) + units, "{:.1f}".format(agg / denom)])
res += html.table("Resources usage", resource_headers, resource_data)
yield Menu1st.per_job, job.summary, HTMLBlock(res)
# IOPS/latency distribution
class IOHist(JobReporter):
"""IOPS.latency distribution histogram"""
suite_types = {'fio'}
def get_divs(self,
suite: SuiteConfig,
job: JobConfig,
rstorage: ResultStorage) -> Iterator[Tuple[str, str, HTMLBlock]]:
fjob = cast(FioJobConfig, job)
yield Menu1st.per_job, fjob.summary, HTMLBlock(html.H2(html.center("Load histograms")))
agg_lat = get_aggregated(rstorage, suite, fjob, "lat")
bins_edges = numpy.array(get_lat_vals(agg_lat.data.shape[1]), dtype='float32') / 1000 # convert us to ms
lat_stat_prop = calc_histo_stat_props(agg_lat, bins_edges, rebins_count=StyleProfile.hist_lat_boxes)
# import IPython
# IPython.embed()
long_summary = cast(FioJobParams, fjob.params).long_summary
title = "Latency distribution"
units = "ms"
fpath = plot_hist(rstorage, agg_lat.source(tag='hist.svg'), title, units, lat_stat_prop) # type: str
yield Menu1st.per_job, fjob.summary, HTMLBlock(html.img(fpath))
agg_io = get_aggregated(rstorage, suite, fjob, "bw")
if fjob.bsize >= LARGE_BLOCKS:
title = "BW distribution"
units = "MiBps"
agg_io.data //= MiB2KiB
else:
title = "IOPS distribution"
agg_io.data //= fjob.bsize
units = "IOPS"
io_stat_prop = calc_norm_stat_props(agg_io, bins_count=StyleProfile.hist_boxes)
fpath = plot_hist(rstorage, agg_io.source(tag='hist.svg'), title, units, io_stat_prop) # type: str
yield Menu1st.per_job, fjob.summary, HTMLBlock(html.img(fpath))
# IOPS/latency over test time for each job
class IOTime(JobReporter):
"""IOPS/latency during test"""
suite_types = {'fio'}
def get_divs(self,
suite: SuiteConfig,
job: JobConfig,
rstorage: ResultStorage) -> Iterator[Tuple[str, str, HTMLBlock]]:
fjob = cast(FioJobConfig, job)
yield Menu1st.per_job, fjob.summary, HTMLBlock(html.H2(html.center("Load over time")))
agg_io = get_aggregated(rstorage, suite, fjob, "bw")
if fjob.bsize >= LARGE_BLOCKS:
title = "Bandwidth"
units = "MiBps"
agg_io.data //= MiB2KiB
else:
title = "IOPS"
agg_io.data //= fjob.bsize
units = "IOPS"
fpath = plot_v_over_time(rstorage, agg_io.source(tag='ts.svg'), title, units, agg_io) # type: str
yield Menu1st.per_job, fjob.summary, HTMLBlock(html.img(fpath))
agg_lat = get_aggregated(rstorage, suite, fjob, "lat")
bins_edges = numpy.array(get_lat_vals(agg_lat.data.shape[1]), dtype='float32') / 1000
title = "Latency"
fpath = plot_lat_over_time(rstorage, agg_lat.source(tag='ts.svg'), title, agg_lat, bins_edges) # type: str
yield Menu1st.per_job, fjob.summary, HTMLBlock(html.img(fpath))
title = "Latency heatmap"
fpath = plot_heatmap(rstorage, agg_lat.source(tag='hmap.png'), title, agg_lat, bins_edges) # type: str
yield Menu1st.per_job, fjob.summary, HTMLBlock(html.img(fpath))
class ResourceUsage:
def __init__(self, io_r_ops: int, io_w_ops: int, io_r_kb: int, io_w_kb: int) -> None:
self.io_w_ops = io_w_ops
self.io_r_ops = io_r_ops
self.io_w_kb = io_w_kb
self.io_r_kb = io_r_kb
self.cpu_used_user = None # type: int
self.cpu_used_sys = None # type: int
self.cpu_wait_io = None # type: int
self.net_send_packets = None # type: int
self.net_recv_packets = None # type: int
self.net_send_kb = None # type: int
self.net_recv_kb = None # type: int
# Cluster load over test time
class ClusterLoad(JobReporter):
"""IOPS/latency during test"""
    # TODO: units should come from the sensor
storage_sensors = [
('block-io', 'reads_completed', "Read ops", 'iops'),
('block-io', 'writes_completed', "Write ops", 'iops'),
('block-io', 'sectors_read', "Read kb", 'kb'),
('block-io', 'sectors_written', "Write kb", 'kb'),
]
def get_divs(self,
suite: SuiteConfig,
job: JobConfig,
rstorage: ResultStorage) -> Iterator[Tuple[str, str, HTMLBlock]]:
        # split nodes into test and other
storage = rstorage.storage
nodes = storage.load_list(NodeInfo, "all_nodes") # type: List[NodeInfo]
yield Menu1st.per_job, job.summary, HTMLBlock(html.H2(html.center("Cluster load")))
test_nodes = {node.node_id for node in nodes if 'testnode' in node.roles}
cluster_nodes = {node.node_id for node in nodes if 'testnode' not in node.roles}
# convert ms to s
time_range = (job.reliable_info_range[0] // MS2S, job.reliable_info_range[1] // MS2S)
        duration = time_range[1] - time_range[0]
        for sensor, metric, sensor_title, units in self.storage_sensors:
            sum_testnode = numpy.zeros((duration,))
            sum_other = numpy.zeros((duration,))
for path, groups in rstorage.iter_sensors(sensor=sensor, metric=metric):
# todo: should return sensor units
data = get_sensor_for_time_range(rstorage,
groups['node_id'],
sensor,
groups['dev'],
metric, time_range)
if groups['node_id'] in test_nodes:
sum_testnode += data
else:
sum_other += data
ds = DataSource(suite_id=suite.storage_id,
job_id=job.storage_id,
node_id="test_nodes",
sensor=sensor,
dev=AGG_TAG,
metric=metric,
tag="ts.svg")
# s to ms
ts = TimeSeries(name="",
times=numpy.arange(*time_range) * MS2S,
data=sum_testnode,
raw=None,
units=units,
time_units="us",
source=ds)
fpath = plot_v_over_time(rstorage, ds, sensor_title, sensor_title, ts=ts) # type: str
yield Menu1st.per_job, job.summary, HTMLBlock(html.img(fpath))
# Ceph cluster summary
class ResourceConsumption(Reporter):
"""Resources consumption report, only text"""
# Node load over test time
class NodeLoad(Reporter):
"""IOPS/latency during test"""
# Ceph cluster summary
class CephClusterSummary(Reporter):
"""IOPS/latency during test"""
# TODO: Ceph operation breakout report
# TODO: Resource consumption for different type of test
# ------------------------------------------ REPORT STAGES -----------------------------------------------------------
class HtmlReportStage(Stage):
priority = StepOrder.REPORT
def run(self, ctx: TestRun) -> None:
rstorage = ResultStorage(ctx.storage)
job_reporters = [StatInfo(), IOTime(), IOHist(), ClusterLoad()] # type: List[JobReporter]
reporters = [IO_QD()] # type: List[Reporter]
# job_reporters = [ClusterLoad()]
# reporters = []
root_dir = os.path.dirname(os.path.dirname(wally.__file__))
doc_templ_path = os.path.join(root_dir, "report_templates/index.html")
        with open(doc_templ_path, "rt") as fd:
            report_template = fd.read()
        css_file_src = os.path.join(root_dir, "report_templates/main.css")
        with open(css_file_src, "rt") as fd:
            css_file = fd.read()
menu_block = []
content_block = []
link_idx = 0
# matplotlib.rcParams.update(ctx.config.reporting.matplotlib_params.raw())
# ColorProfile.__dict__.update(ctx.config.reporting.colors.raw())
# StyleProfile.__dict__.update(ctx.config.reporting.style.raw())
items = defaultdict(lambda: defaultdict(list)) # type: Dict[str, Dict[str, List[HTMLBlock]]]
# TODO: filter reporters
for suite in rstorage.iter_suite(FioTest.name):
all_jobs = list(rstorage.iter_job(suite))
all_jobs.sort(key=lambda job: job.params)
for job in all_jobs:
for reporter in job_reporters:
                    for block, item, html_block in reporter.get_divs(suite, job, rstorage):
                        items[block][item].append(html_block)
if DEBUG:
break
for reporter in reporters:
                for block, item, html_block in reporter.get_divs(suite, rstorage):
                    items[block][item].append(html_block)
if DEBUG:
break
for idx_1st, menu_1st in enumerate(sorted(items, key=lambda x: menu_1st_order.index(x))):
menu_block.append(
'<a href="#item{}" class="nav-group" data-toggle="collapse" data-parent="#MainMenu">{}</a>'
.format(idx_1st, menu_1st)
)
menu_block.append('<div class="collapse" id="item{}">'.format(idx_1st))
for menu_2nd in sorted(items[menu_1st]):
menu_block.append(' <a href="#content{}" class="nav-group-item">{}</a>'
.format(link_idx, menu_2nd))
content_block.append('<div id="content{}">'.format(link_idx))
content_block.extend(" " + x.data for x in items[menu_1st][menu_2nd])
content_block.append('</div>')
link_idx += 1
menu_block.append('</div>')
report = report_template.replace("{{{menu}}}", ("\n" + " " * 16).join(menu_block))
report = report.replace("{{{content}}}", ("\n" + " " * 16).join(content_block))
report_path = rstorage.put_report(report, "index.html")
rstorage.put_report(css_file, "main.css")
logger.info("Report is stored into %r", report_path)
class ConsoleReportStage(Stage):
priority = StepOrder.REPORT
def run(self, ctx: TestRun) -> None:
# TODO(koder): load data from storage
raise NotImplementedError("...")