koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 1 | import math |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 2 | import logging |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 3 | import itertools |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 4 | import statistics |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame] | 5 | from typing import List, Callable, Iterable, cast |
koder aka kdanilov | cff7b2e | 2015-04-18 20:48:15 +0300 | [diff] [blame] | 6 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 7 | import numpy |
| 8 | from scipy import stats, optimize |
| 9 | from numpy import linalg |
| 10 | from numpy.polynomial.chebyshev import chebfit, chebval |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 11 | |
| 12 | |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame] | 13 | from .result_classes import NormStatProps, HistoStatProps, TimeSeries |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 14 | from .utils import Number |
koder aka kdanilov | bb6d6cd | 2015-06-20 02:55:07 +0300 | [diff] [blame] | 15 | |
| 16 | |
logger = logging.getLogger("wally")

# tolerance for floating-point comparisons
DOUBLE_DELTA = 1e-8

# below this many samples a confidence interval is not statistically meaningful
MIN_VALUES_FOR_CONFIDENCE = 7


# arithmetic mean of a sequence
average = numpy.mean


def dev(x: Iterable[Number]) -> float:
    """Sample (ddof=1) standard deviation of *x*."""
    return math.sqrt(numpy.var(x, ddof=1))
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 24 | |
| 25 | |
def calc_norm_stat_props(ts: TimeSeries, bins_count: int, confidence: float = 0.95) -> NormStatProps:
    """Calculate statistical properties of an array of numbers, treating it
    as (roughly) normally distributed.

    Parameters:
        ts: time series whose ``data`` attribute holds the samples
        bins_count: number of bins for the histogram stored in the result
        confidence: confidence level for the mean's confidence interval

    Raises:
        ValueError: if the input series is empty
    """

    # array.array has very basic support, so treat it as a plain list of ints
    data = cast(List[int], ts.data)
    res = NormStatProps(data)  # type: ignore

    if len(data) == 0:
        raise ValueError("Input array is empty")

    data = sorted(data)
    res.average = average(data)
    res.deviation = dev(data)

    # data is sorted, so min/max are the ends
    res.max = data[-1]
    res.min = data[0]

    # BUG FIX: perc_95 used to be skipped (perc_99 was assigned twice),
    # so the 95th percentile was silently dropped
    res.perc_50, res.perc_90, res.perc_95, res.perc_99 = numpy.percentile(data, q=[50., 90., 95., 99.])

    # a confidence interval over too few samples is meaningless
    if len(data) >= MIN_VALUES_FOR_CONFIDENCE:
        res.confidence = stats.sem(data) * \
                         stats.t.ppf((1 + confidence) / 2, len(data) - 1)
        res.confidence_level = confidence
    else:
        res.confidence = None
        res.confidence_level = None

    res.bins_populations, bins_edges = numpy.histogram(data, bins=bins_count)
    res.bins_mids = (bins_edges[:-1] + bins_edges[1:]) / 2

    # normaltest may fail (e.g. on degenerate input); keep going without it
    try:
        res.normtest = stats.mstats.normaltest(data)
    except Exception as exc:
        logger.warning("stats.mstats.normaltest failed with error: %s", exc)

    res.skew = stats.skew(data)
    res.kurt = stats.kurtosis(data)

    return res
| 65 | |
| 66 | |
def calc_histo_stat_props(ts: TimeSeries,
                          bins_edges: numpy.array,
                          bins_count: int,
                          min_valuable: float = 0.0001) -> HistoStatProps:
    """Calculate statistical properties of histogram-shaped time-series data.

    Parameters:
        ts: time series; ``ts.data`` is a flat array of bin populations,
            ``ts.second_axis_size`` bins per measurement
        bins_edges: values of the histogram bin edges
        bins_count: number of bins in the aggregated result histogram
        min_valuable: fraction of the total population below which a bin is
            ignored when building the result histogram

    Returns:
        HistoStatProps with percentiles, min/max and the re-binned histogram.
    """
    # reshape the flat data into one row per measurement
    data = numpy.array(ts.data, dtype='int')
    data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size]  # type: ignore

    res = HistoStatProps(ts.data, ts.second_axis_size)

    # summ across all series
    aggregated = numpy.sum(data, axis=0, dtype='int')
    total = numpy.sum(aggregated)

    # minimal value used for histo
    min_val_on_histo = total * min_valuable

    # percentiles levels
    expected = [total * 0.5, total * 0.9, total * 0.95, total * 0.99]
    percentiles = []

    # all indexes, where values greater than min_val_on_histo
    valuable_idxs = []

    curr_summ = 0
    non_zero = aggregated.nonzero()[0]

    # calculate percentiles and valuable_indexes in one pass over the
    # cumulative population
    for idx in non_zero:
        val = aggregated[idx]
        while expected and curr_summ + val >= expected[0]:
            percentiles.append(bins_edges[idx])
            del expected[0]

        curr_summ += val

        if val >= min_val_on_histo:
            valuable_idxs.append(idx)

    res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles

    # minimal and maximal non-zero elements
    # BUG FIX: was bins_edges[aggregated[non_zero[0]]] - indexed the edges
    # array by a bin *population* instead of by the bin index
    res.min = bins_edges[non_zero[0]]
    # NOTE(review): if bins_edges holds one more edge than aggregated has
    # bins, this guard should compare against len(bins_edges) - 1; cannot
    # tell the edge-array convention from here - left unchanged
    res.max = bins_edges[non_zero[-1] + (1 if non_zero[-1] != len(bins_edges) else 0)]

    # minimal and maximal valuable elements
    val_idx_min = valuable_idxs[0]
    val_idx_max = valuable_idxs[-1]

    raw_bins_populations = aggregated[val_idx_min: val_idx_max + 1]
    raw_bins_edges = bins_edges[val_idx_min: val_idx_max + 2]
    raw_bins_mids = cast(numpy.array, (raw_bins_edges[1:] + raw_bins_edges[:-1]) / 2)

    # BUG FIX: step was (last + first) / bins_count, which only splits the
    # [first, last] range into bins_count equal pieces when first == 0
    step = (raw_bins_mids[-1] - raw_bins_mids[0]) / bins_count
    # renamed from 'next', which shadowed the builtin
    right_edge = raw_bins_mids[0]

    # aggregate raw histogram with many bins into result histogram with bins_count bins
    cidx = 0
    bins_populations = []
    bins_mids = []

    while cidx < len(raw_bins_mids):
        right_edge += step
        bin_population = 0

        # sweep all raw bins whose mid falls into the current result bin
        while cidx < len(raw_bins_mids) and raw_bins_mids[cidx] <= right_edge:
            bin_population += raw_bins_populations[cidx]
            cidx += 1

        bins_populations.append(bin_population)
        bins_mids.append(right_edge - step / 2)

    res.bins_populations = numpy.array(bins_populations, dtype='int')
    res.bins_mids = numpy.array(bins_mids, dtype='float32')

    return res
| 142 | |
| 143 | |
def groupby_globally(data: Iterable, key_func: Callable):
    """Bucket every item of *data* by its key_func key into a dict.

    Unlike bare itertools.groupby, non-adjacent items with an equal key
    land in the same bucket.  Each key must be a 4-tuple of
    (block size, cache type, action, concurrency).
    """
    buckets = {}  # type: ignore

    for (bs, cache_tp, act, conc), items in itertools.groupby(data, key_func):
        buckets.setdefault((bs, cache_tp, act, conc), []).extend(items)

    return buckets
| 153 | |
| 154 | |
def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
    """Return ynew - values at *xnew* of a Chebyshev polynomial of degree
    *curved_coef* least-squares-fitted to the (x, y) samples."""
    cheb_coefs = chebfit(x, y, curved_coef)
    return cast(List[float], chebval(xnew, cheb_coefs))
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 158 | |
| 159 | |
def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
    """Least-squares fit a straight line to (x, y) and evaluate it at *xnew*.

    When relative_dist is False the optimizer minimizes absolute residuals
    (y - line(x)); otherwise it minimizes relative residuals.
    Raises ValueError when the optimizer fails to converge.
    """
    xs = numpy.array(x)
    ys = numpy.array(y)

    # the model: a straight line a * t + b, packed as coefs = (a, b)
    def line_at(coefs, t):
        return coefs[0] * t + coefs[1]

    # residuals measured in relative units
    def rel_residuals(coefs, t, vals):
        return 1.0 - vals / line_at(coefs, t)

    # residuals measured in absolute units
    def abs_residuals(coefs, t, vals):
        return vals - line_at(coefs, t)

    # choose distance mode
    residuals = rel_residuals if relative_dist else abs_residuals

    # seed the optimizer with the exact line through the first two points
    initial = tuple(linalg.solve([[xs[0], 1.0], [xs[1], 1.0]],
                                 ys[:2]))

    fitted, status = optimize.leastsq(residuals, initial[:], args=(xs, ys))

    # leastsq reports success with a status code in 1..4
    if status not in range(1, 5):
        raise ValueError("No line for this dots")

    # evaluate the fitted line at the requested points
    return line_at(fitted, numpy.array(xnew))
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 194 | |
| 195 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 196 | # TODO: revise next |
| 197 | # def difference(y, ynew): |
| 198 | # """returns average and maximum relative and |
| 199 | # absolute differences between y and ynew |
| 200 | # result may contain None values for y = 0 |
| 201 | # return value - tuple: |
| 202 | # [(abs dif, rel dif) * len(y)], |
| 203 | # (abs average, abs max), |
| 204 | # (rel average, rel max)""" |
| 205 | # |
| 206 | # abs_dlist = [] |
| 207 | # rel_dlist = [] |
| 208 | # |
| 209 | # for y1, y2 in zip(y, ynew): |
| 210 | # # absolute |
| 211 | # abs_dlist.append(y1 - y2) |
| 212 | # |
| 213 | # if y1 > 1E-6: |
| 214 | # rel_dlist.append(abs(abs_dlist[-1] / y1)) |
| 215 | # else: |
| 216 | # raise ZeroDivisionError("{0!r} is too small".format(y1)) |
| 217 | # |
| 218 | # da_avg = sum(abs_dlist) / len(abs_dlist) |
| 219 | # dr_avg = sum(rel_dlist) / len(rel_dlist) |
| 220 | # |
| 221 | # return (zip(abs_dlist, rel_dlist), |
| 222 | # (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist)) |
| 223 | # ) |