blob: 666c753d4573f9f7da78b392b079c2fd8342fc24 [file] [log] [blame]
import os
import pprint
import logging
from typing import cast, Iterator, Tuple, Type, Dict, Optional, List
import numpy
from .suits.job import JobConfig
from .result_classes import SuiteConfig, TimeSeries, DataSource, StatProps, IResultStorage, ArrayData
from .storage import Storage
from .utils import StopTestError
from .suits.all_suits import all_suits
logger = logging.getLogger('wally')
class DB_re:
node_id = r'\d+.\d+.\d+.\d+:\d+'
job_id = r'[-a-zA-Z0-9_]+_\d+'
suite_id = r'[a-z_]+_\d+'
sensor = r'[-a-z_]+'
dev = r'[-a-zA-Z0-9_]+'
tag = r'[a-z_.]+'
metric = r'[a-z_.]+'
class DB_paths:
suite_cfg_r = r'results/{suite_id}\.info\.yml'
job_root = r'results/{suite_id}\.{job_id}/'
job_cfg_r = job_root + r'info\.yml'
# time series, data from load tool, sensor is a tool name
ts_r = job_root + r'{node_id}\.{sensor}\.{metric}\.{tag}'
# statistica data for ts
stat_r = job_root + r'{node_id}\.{sensor}\.{metric}\.stat\.yaml'
# sensor data
sensor_data_r = r'sensors/{node_id}_{sensor}\.{dev}\.{metric}\.{tag}'
sensor_time_r = r'sensors/{node_id}_collected_at\.csv'
report_root = 'report/'
plot_r = r'{suite_id}\.{job_id}/{node_id}\.{sensor}\.{dev}\.{metric}\.{tag}'
job_cfg = job_cfg_r.replace("\\.", '.')
suite_cfg = suite_cfg_r.replace("\\.", '.')
ts = ts_r.replace("\\.", '.')
stat = stat_r.replace("\\.", '.')
sensor_data = sensor_data_r.replace("\\.", '.')
sensor_time = sensor_time_r.replace("\\.", '.')
plot = plot_r.replace("\\.", '.')
DB_rr = {name: r"(?P<{}>{})".format(name, rr)
for name, rr in DB_re.__dict__.items()
if not name.startswith("__")}
def fill_path(path: str, **params) -> str:
for name, val in params.items():
if val is not None:
path = path.replace("{" + name + "}", val)
return path
class ResultStorage(IResultStorage):
# TODO: check that all path components match required patterns
ts_header_size = 64
ts_header_format = "!IIIcc"
ts_arr_tag = 'csv'
csv_file_encoding = 'ascii'
def __init__(self, storage: Storage) -> None:
self.storage = storage
self.cache = {} # type: Dict[str, Tuple[int, int, ArrayData]]
def sync(self) -> None:
self.storage.sync()
# ----------------- SERIALIZATION / DESERIALIZATION -------------------------------------------------------------
def read_headers(self, fd) -> Tuple[str, List[str], List[str], Optional[numpy.ndarray]]:
header = fd.readline().decode(self.csv_file_encoding).rstrip().split(",")
dtype, has_header2, header2_dtype, *ext_header = header
if has_header2 == 'true':
ln = fd.readline().decode(self.csv_file_encoding).strip()
header2 = numpy.fromstring(ln, sep=',', dtype=header2_dtype)
else:
assert has_header2 == 'false', \
"In file {} has_header2 is not true/false, but {!r}".format(fd.name, has_header2)
header2 = None
return dtype, ext_header, header, header2
def load_array(self, path: str) -> ArrayData:
"""
Load array from file, shoult not be called directly
:param path: file path
:return: ArrayData
"""
with self.storage.get_fd(path, "rb") as fd:
fd.seek(0, os.SEEK_SET)
stats = os.fstat(fd.fileno())
if path in self.cache:
size, atime, arr_info = self.cache[path]
if size == stats.st_size and atime == stats.st_atime_ns:
return arr_info
data_dtype, header, _, header2 = self.read_headers(fd)
assert data_dtype == 'uint64', path
dt = fd.read().decode(self.csv_file_encoding).strip()
if len(dt) != 0:
arr = numpy.fromstring(dt.replace("\n", ','), sep=',', dtype=data_dtype)
lines = dt.count("\n") + 1
assert len(set(ln.count(',') for ln in dt.split("\n"))) == 1, \
"Data lines in {!r} have different element count".format(path)
arr.shape = [lines] if lines == arr.size else [lines, -1]
else:
arr = None
arr_data = ArrayData(header, header2, arr)
self.cache[path] = (stats.st_size, stats.st_atime_ns, arr_data)
return arr_data
def put_array(self, path: str, data: numpy.array, header: List[str], header2: numpy.ndarray = None,
append_on_exists: bool = False) -> None:
header = [data.dtype.name] + \
(['false', ''] if header2 is None else ['true', header2.dtype.name]) + \
header
exists = append_on_exists and path in self.storage
vw = data.view().reshape((data.shape[0], 1)) if len(data.shape) == 1 else data
mode = "cb" if not exists else "rb+"
with self.storage.get_fd(path, mode) as fd:
if exists:
data_dtype, _, full_header, curr_header2 = self.read_headers(fd)
assert data_dtype == data.dtype.name, \
"Path {!r}. Passed data type ({!r}) and current data type ({!r}) doesn't match"\
.format(path, data.dtype.name, data_dtype)
assert header == full_header, \
"Path {!r}. Passed header ({!r}) and current header ({!r}) doesn't match"\
.format(path, header, full_header)
assert header2 == curr_header2, \
"Path {!r}. Passed header2 != current header2: {!r}\n{!r}"\
.format(path, header2, curr_header2)
fd.seek(0, os.SEEK_END)
else:
fd.write((",".join(header) + "\n").encode(self.csv_file_encoding))
if header2 is not None:
fd.write((",".join(map(str, header2)) + "\n").encode(self.csv_file_encoding))
numpy.savetxt(fd, vw, delimiter=',', newline="\n", fmt="%lu")
def load_ts(self, ds: DataSource, path: str) -> TimeSeries:
"""
Load time series, generated by fio or other tool, should not be called directly,
use iter_ts istead.
:param ds: data source path
:param path: path in data storage
:return: TimeSeries
"""
(units, time_units), header2, data = self.load_array(path)
times = data[:,0]
ts_data = data[:,1:]
if ts_data.shape[1] == 1:
ts_data.shape = (ts_data.shape[0],)
return TimeSeries("{}.{}".format(ds.dev, ds.sensor),
raw=None,
data=ts_data,
times=times,
source=ds,
units=units,
time_units=time_units,
histo_bins=header2)
def load_sensor_raw(self, ds: DataSource) -> bytes:
path = DB_paths.sensor_data.format(**ds.__dict__)
with self.storage.get_fd(path, "rb") as fd:
return fd.read()
def load_sensor(self, ds: DataSource) -> TimeSeries:
# sensors has no shape
path = DB_paths.sensor_time.format(**ds.__dict__)
collect_header, must_be_none, collected_at = self.load_array(path)
# cut 'collection end' time
# .copy needed to really remove 'collection end' element to make c_interpolate_.. works correctly
collected_at = collected_at[::2].copy()
# there must be no histogram for collected_at
assert must_be_none is None, "Extra header2 {!r} in collect_at file at {!r}".format(must_be_none, path)
assert collect_header == [ds.node_id, 'collected_at', 'us'],\
"Unexpected collect_at header {!r} at {!r}".format(collect_header, path)
assert len(collected_at.shape) == 1, "Collected_at must be 1D at {!r}".format(path)
data_path = DB_paths.sensor_data.format(**ds.__dict__)
data_header, must_be_none, data = self.load_array(data_path)
# there must be no histogram for any sensors
assert must_be_none is None, "Extra header2 {!r} in sensor data file {!r}".format(must_be_none, data_path)
data_units = data_header[2]
assert data_header == [ds.node_id, ds.metric_fqdn, data_units], \
"Unexpected data header {!r} at {!r}".format(data_header, data_path)
assert len(data.shape) == 1, "Sensor data must be 1D at {!r}".format(data_path)
return TimeSeries(ds.metric_fqdn,
raw=None,
data=data,
times=collected_at,
source=ds,
units=data_units,
time_units='us')
# ------------- CHECK DATA IN STORAGE ----------------------------------------------------------------------------
def check_plot_file(self, source: DataSource) -> Optional[str]:
path = DB_paths.plot.format(**source.__dict__)
fpath = self.storage.resolve_raw(DB_paths.report_root + path)
return path if os.path.exists(fpath) else None
# ------------- PUT DATA INTO STORAGE --------------------------------------------------------------------------
def put_or_check_suite(self, suite: SuiteConfig) -> None:
path = DB_paths.suite_cfg.format(suite_id=suite.storage_id)
if path in self.storage:
db_cfg = self.storage.load(SuiteConfig, path)
if db_cfg != suite:
logger.error("Current suite %s config is not equal to found in storage at %s", suite.test_type, path)
logger.debug("Current: \n%s\nStorage:\n%s", pprint.pformat(db_cfg), pprint.pformat(suite))
raise StopTestError()
else:
self.storage.put(suite, path)
def put_job(self, suite: SuiteConfig, job: JobConfig) -> None:
path = DB_paths.job_cfg.format(suite_id=suite.storage_id, job_id=job.storage_id)
self.storage.put(job, path)
def put_ts(self, ts: TimeSeries) -> None:
assert ts.data.dtype == ts.times.dtype, "Data type {!r} != time type {!r}".format(ts.data.dtype, ts.times.dtype)
assert ts.data.dtype.kind == 'u', "Only unsigned ints are accepted"
assert ts.source.tag == self.ts_arr_tag, "Incorrect source tag == {!r}, must be {!r}".format(ts.source.tag,
self.ts_arr_tag)
csv_path = DB_paths.ts.format(**ts.source.__dict__)
header = [ts.units, ts.time_units]
tv = ts.times.view().reshape((-1, 1))
if len(ts.data.shape) == 1:
dv = ts.data.view().reshape((ts.times.shape[0], -1))
else:
dv = ts.data
result = numpy.concatenate((tv, dv), axis=1)
if ts.histo_bins is not None:
self.put_array(csv_path, result, header, header2=ts.histo_bins)
else:
self.put_array(csv_path, result, header)
if ts.raw:
raw_path = DB_paths.ts.format(**ts.source(tag=ts.raw_tag).__dict__)
self.storage.put_raw(ts.raw, raw_path)
def put_extra(self, data: bytes, source: DataSource) -> None:
self.storage.put_raw(data, DB_paths.ts.format(**source.__dict__))
def put_stat(self, data: StatProps, source: DataSource) -> None:
self.storage.put(data, DB_paths.stat.format(**source.__dict__))
# return path to file to be inserted into report
def put_plot_file(self, data: bytes, source: DataSource) -> str:
path = DB_paths.plot.format(**source.__dict__)
self.storage.put_raw(data, DB_paths.report_root + path)
return path
def put_report(self, report: str, name: str) -> str:
return self.storage.put_raw(report.encode(self.csv_file_encoding), DB_paths.report_root + name)
def put_sensor_raw(self, data: bytes, ds: DataSource) -> None:
path = DB_paths.sensor_data.format(**ds.__dict__)
with self.storage.get_fd(path, "cb") as fd:
fd.write(data)
def append_sensor(self, data: numpy.array, ds: DataSource, units: str, histo_bins: numpy.ndarray = None) -> None:
if ds.metric == 'collected_at':
path = DB_paths.sensor_time
metrics_fqn = 'collected_at'
else:
path = DB_paths.sensor_data
metrics_fqn = ds.metric_fqdn
if ds.metric == 'lat':
assert len(data.shape) == 2, "Latency should be histo array"
assert histo_bins is not None, "Latency should have histo bins"
path = path.format(**ds.__dict__)
self.put_array(path, data, [ds.node_id, metrics_fqn, units], header2=histo_bins, append_on_exists=True)
# ------------- GET DATA FROM STORAGE --------------------------------------------------------------------------
def get_stat(self, stat_cls: Type[StatProps], source: DataSource) -> StatProps:
return self.storage.load(stat_cls, DB_paths.stat.format(**source.__dict__))
# ------------- ITER OVER STORAGE ------------------------------------------------------------------------------
def iter_paths(self, path_glob) -> Iterator[Tuple[bool, str, Dict[str, str]]]:
path = path_glob.format(**DB_rr).split("/")
yield from self.storage._iter_paths("", path, {})
def iter_suite(self, suite_type: str = None) -> Iterator[SuiteConfig]:
for is_file, suite_info_path, groups in self.iter_paths(DB_paths.suite_cfg_r):
assert is_file
suite = self.storage.load(SuiteConfig, suite_info_path)
# suite = cast(SuiteConfig, self.storage.load(SuiteConfig, suite_info_path))
assert suite.storage_id == groups['suite_id']
if not suite_type or suite.test_type == suite_type:
yield suite
def iter_job(self, suite: SuiteConfig) -> Iterator[JobConfig]:
job_glob = fill_path(DB_paths.job_cfg_r, suite_id=suite.storage_id)
job_config_cls = all_suits[suite.test_type].job_config_cls
for is_file, path, groups in self.iter_paths(job_glob):
assert is_file
job = cast(JobConfig, self.storage.load(job_config_cls, path))
assert job.storage_id == groups['job_id']
yield job
# iterate over test tool data
def iter_ts(self, suite: SuiteConfig, job: JobConfig, **filters) -> Iterator[TimeSeries]:
filters.update(suite_id=suite.storage_id, job_id=job.storage_id)
ts_glob = fill_path(DB_paths.ts_r, **filters)
for is_file, path, groups in self.iter_paths(ts_glob):
tag = groups["tag"]
if tag != 'csv':
continue
assert is_file
groups = groups.copy()
groups.update(filters)
ds = DataSource(suite_id=suite.storage_id,
job_id=job.storage_id,
node_id=groups["node_id"],
sensor=groups["sensor"],
dev=None,
metric=groups["metric"],
tag=tag)
yield self.load_ts(ds, path)
def iter_sensors(self, node_id: str = None, sensor: str = None, dev: str = None, metric: str = None) -> \
Iterator[Tuple[str, DataSource]]:
vls = dict(node_id=node_id, sensor=sensor, dev=dev, metric=metric)
path = fill_path(DB_paths.sensor_data_r, **vls)
for is_file, path, groups in self.iter_paths(path):
cvls = vls.copy()
cvls.update(groups)
yield path, DataSource(**cvls)