Blame - wally/statistic.py - mcp/cvp-wally

blob: 259ac69118b38e51a8fc7ae6c71a440b5bb222c4 [file] [log] [blame]

koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	1	import math
koder aka kdanilov	f286517	2016-12-30 03:35:11 +0200	[diff] [blame^]	2	import array
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	3	import logging
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	4	import itertools
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	5	import statistics
koder aka kdanilov	f286517	2016-12-30 03:35:11 +0200	[diff] [blame^]	6	from typing import List, Callable, Iterable, cast
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	7
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	8	import numpy
				9	from scipy import stats, optimize
				10	from numpy import linalg
				11	from numpy.polynomial.chebyshev import chebfit, chebval
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	12
				13
koder aka kdanilov	f286517	2016-12-30 03:35:11 +0200	[diff] [blame^]	14	from .result_classes import NormStatProps, HistoStatProps, TimeSeries
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	15	from .utils import Number
koder aka kdanilov	bb6d6cd	2015-06-20 02:55:07 +0300	[diff] [blame]	16
				17
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	18	logger = logging.getLogger("wally")
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	19	DOUBLE_DELTA = 1e-8
koder aka kdanilov	e87ae65	2015-04-20 02:14:35 +0300	[diff] [blame]	20
				21
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	22	average = statistics.mean
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	23	dev = lambda x: math.sqrt(statistics.variance(x))
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	24
				25
koder aka kdanilov	f286517	2016-12-30 03:35:11 +0200	[diff] [blame^]	26	def calc_norm_stat_props(ts: TimeSeries, confidence: float = 0.95) -> NormStatProps:
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	27	"Calculate statistical properties of array of numbers"
				28
koder aka kdanilov	f286517	2016-12-30 03:35:11 +0200	[diff] [blame^]	29	data = ts.data
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	30	res = NormStatProps(data)
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	31
				32	if len(data) == 0:
				33	raise ValueError("Input array is empty")
				34
				35	data = sorted(data)
				36	res.average = average(data)
				37	res.deviation = dev(data)
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	38
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	39	res.max = data[-1]
				40	res.min = data[0]
				41
				42	res.perc_50 = numpy.percentile(data, 50)
				43	res.perc_90 = numpy.percentile(data, 90)
				44	res.perc_95 = numpy.percentile(data, 95)
				45	res.perc_99 = numpy.percentile(data, 99)
				46
				47	if len(data) >= 3:
				48	res.confidence = stats.sem(data) * \
				49	stats.t.ppf((1 + confidence) / 2, len(data) - 1)
				50	else:
				51	res.confidence = None
				52
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	53	res.bin_populations, res.bin_edges = numpy.histogram(data, 'auto')
				54
				55	try:
				56	res.normtest = stats.mstats.normaltest(data)
				57	except Exception as exc:
				58	logger.warning("stats.mstats.normaltest failed with error: %s", exc)
				59
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	60	return res
				61
				62
koder aka kdanilov	f286517	2016-12-30 03:35:11 +0200	[diff] [blame^]	63	def calc_histo_stat_props(ts: TimeSeries) -> HistoStatProps:
				64	data = numpy.array(ts.data)
				65	data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size]
				66
				67	res = HistoStatProps(ts.data, ts.second_axis_size)
				68	aggregated = numpy.sum(data, axis=0)
				69
				70	full_sum = numpy.sum(aggregated)
				71	expected = [full_sum * 0.5, full_sum * 0.9, full_sum * 0.95, full_sum * 0.99]
				72	percentiles = []
				73
				74	val_min = None
				75	val_max = None
				76
				77	for idx, val in enumerate(aggregated):
				78	while expected and full_sum + val >= expected[0]:
				79	percentiles.append(idx)
				80	del expected[0]
				81
				82	full_sum += val
				83
				84	if val != 0:
				85	if val_min is None:
				86	val_min = idx
				87	val_max = idx
				88
				89	res.perc_50, res.perc_90, res.perc_95, res.perc_99 = map(ts.bins_edges.__getitem__, percentiles)
				90	res.min = ts.bins_edges[val_min]
				91	res.max = ts.bins_edges[val_max]
				92	res.bin_populations = aggregated
				93	return res
				94
				95
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	96	def groupby_globally(data: Iterable, key_func: Callable):
				97	grouped = {} # type: ignore
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	98	grouped_iter = itertools.groupby(data, key_func)
				99
				100	for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
				101	key = (bs, cache_tp, act, conc)
				102	grouped.setdefault(key, []).extend(curr_data_it)
				103
				104	return grouped
				105
				106
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	107	def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	108	"""returns ynew - y values of some curve approximation"""
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	109	return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	110
				111
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	112	def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
				113	"""
				114	x, y - test data, xnew - dots, where we want find approximation
				115	if not relative_dist distance = y - newy
				116	returns ynew - y values of linear approximation
				117	"""
				118	ox = numpy.array(x)
				119	oy = numpy.array(y)
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	120
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	121	# set approximation function
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	122	def func_line(tpl, x):
				123	return tpl[0] * x + tpl[1]
				124
				125	def error_func_rel(tpl, x, y):
				126	return 1.0 - y / func_line(tpl, x)
				127
				128	def error_func_abs(tpl, x, y):
				129	return y - func_line(tpl, x)
				130
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	131	# choose distance mode
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	132	error_func = error_func_rel if relative_dist else error_func_abs
				133
				134	tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
				135	oy[:2]))
				136
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	137	# find line
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	138	tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy))
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	139
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	140	# if error
				141	if success not in range(1, 5):
				142	raise ValueError("No line for this dots")
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	143
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	144	# return new dots
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	145	return func_line(tpl_final, numpy.array(xnew))
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	146
				147
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	148	# TODO: revise next
				149	# def difference(y, ynew):
				150	# """returns average and maximum relative and
				151	# absolute differences between y and ynew
				152	# result may contain None values for y = 0
				153	# return value - tuple:
				154	# [(abs dif, rel dif) * len(y)],
				155	# (abs average, abs max),
				156	# (rel average, rel max)"""
				157	#
				158	# abs_dlist = []
				159	# rel_dlist = []
				160	#
				161	# for y1, y2 in zip(y, ynew):
				162	# # absolute
				163	# abs_dlist.append(y1 - y2)
				164	#
				165	# if y1 > 1E-6:
				166	# rel_dlist.append(abs(abs_dlist[-1] / y1))
				167	# else:
				168	# raise ZeroDivisionError("{0!r} is too small".format(y1))
				169	#
				170	# da_avg = sum(abs_dlist) / len(abs_dlist)
				171	# dr_avg = sum(rel_dlist) / len(rel_dlist)
				172	#
				173	# return (zip(abs_dlist, rel_dlist),
				174	# (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
				175	# )