import math
import itertools
import statistics
from typing import Union, List, TypeVar, Callable, Iterable, Tuple, Any, cast, Dict

import numpy
from scipy import stats, optimize
from numpy import linalg
from numpy.polynomial.chebyshev import chebfit, chebval


from .result_classes import IStorable


Number = Union[int, float]
TNumber = TypeVar('TNumber', int, float)


DOUBLE_DELTA = 1e-8


average = statistics.mean
dev = statistics.variance  # NOTE: statistics.variance returns the variance, not the standard deviation


class StatProps(IStorable):
    "Statistical properties of a time series"

    yaml_tag = 'stat'

    def __init__(self, data: List[Number]) -> None:
        self.average = None  # type: float
        self.deviation = None  # type: float
        self.confidence = None  # type: float
        self.confidence_level = None  # type: float

        self.perc_99 = None  # type: float
        self.perc_95 = None  # type: float
        self.perc_90 = None  # type: float
        self.perc_50 = None  # type: float

        self.min = None  # type: Number
        self.max = None  # type: Number

        # (bin counts, bin edges), as returned by numpy.histogram
        self.histo = None  # type: Tuple[List[int], List[float]]
        self.data = data

        self.normtest = None  # type: Any

    def __str__(self) -> str:
        res = ["StatProps(num_el={}):".format(len(self.data)),
               "    distr = {0.average} ~ {0.deviation}".format(self),
               "    confidence({0.confidence_level}) = {0.confidence}".format(self),
               "    perc50={0.perc_50}".format(self),
               "    perc90={0.perc_90}".format(self),
               "    perc95={0.perc_95}".format(self),
               "    perc99={0.perc_99}".format(self),
               "    range {0.min} {0.max}".format(self),
               "    normtest = {0.normtest}".format(self)]
        return "\n".join(res)

    def __repr__(self) -> str:
        return str(self)

    def raw(self) -> Dict[str, Any]:
        return self.__dict__.copy()

    @classmethod
    def fromraw(cls, data: Dict[str, Any]) -> 'StatProps':
        res = cls.__new__(cls)
        res.__dict__.update(data)
        return res


def greater_digit_pos(val: Number) -> int:
    return int(math.floor(math.log10(val))) + 1


def round_digits(val: TNumber, num_digits: int = 3) -> TNumber:
    step = 10 ** (greater_digit_pos(val) - num_digits)
    return type(val)(int(val / step) * step)
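

# Illustrative usage sketch (not part of the original module): round_digits()
# keeps the `num_digits` most significant digits of a value and truncates the
# rest, preserving the input type:
#
#     round_digits(12345)         # -> 12300
#     round_digits(12345, 2)      # -> 12000
#     round_digits(3.14159, 3)    # -> ~3.14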


def calc_stat_props(data: List[Number], confidence: float = 0.95) -> StatProps:
    "Calculate statistical properties of an array of numbers"

    if len(data) == 0:
        raise ValueError("Input array is empty")

    res = StatProps(data)

    data = sorted(data)
    res.average = average(data)
    res.deviation = dev(data)
    res.max = data[-1]
    res.min = data[0]

    res.perc_50 = numpy.percentile(data, 50)
    res.perc_90 = numpy.percentile(data, 90)
    res.perc_95 = numpy.percentile(data, 95)
    res.perc_99 = numpy.percentile(data, 99)

    if len(data) >= 3:
        res.confidence = stats.sem(data) * \
            stats.t.ppf((1 + confidence) / 2, len(data) - 1)
    else:
        res.confidence = None
    res.confidence_level = confidence

    res.histo = numpy.histogram(data, 'auto')
    res.normtest = stats.mstats.normaltest(data)
    return res
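

# Minimal usage sketch (illustrative only; the sample values are made up):
#
#     props = calc_stat_props([float(x) for x in range(1, 21)], confidence=0.95)
#     print(props)                          # human-readable summary via __str__
#     stored = props.raw()                  # plain dict, e.g. for serialization
#     restored = StatProps.fromraw(stored)  # rebuild without recomputing stats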


def groupby_globally(data: Iterable, key_func: Callable):
    grouped = {}  # type: ignore
    grouped_iter = itertools.groupby(data, key_func)

    # itertools.groupby only joins *adjacent* items with equal keys, so merge
    # the groups into a dict to also combine non-adjacent ones
    for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
        key = (bs, cache_tp, act, conc)
        grouped.setdefault(key, []).extend(curr_data_it)

    return grouped
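

# Illustrative sketch (the records below are hypothetical): key_func must return
# a 4-item tuple, which the loop above unpacks as (bs, cache_tp, act, conc):
#
#     runs = [(4, 'wb', 'randwrite', 1, 120.0),
#             (4, 'wb', 'randwrite', 1, 118.5),
#             (64, 'wt', 'read', 8, 950.0)]
#     grouped = groupby_globally(runs, key_func=lambda r: r[:4])
#     # grouped[(4, 'wb', 'randwrite', 1)] now holds both matching records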


def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
    """returns ynew - values of a degree-`curved_coef` Chebyshev polynomial
    approximation fitted to (x, y) and evaluated at xnew"""
    return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))
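

# Illustrative sketch (made-up points): fit a degree-2 Chebyshev polynomial to
# noisy samples of y = x**2 and evaluate it at new positions:
#
#     x = [1.0, 2.0, 3.0, 4.0, 5.0]
#     y = [1.1, 3.9, 9.2, 15.8, 25.1]
#     ynew = approximate_curve(x, y, xnew=[1.5, 2.5, 3.5], curved_coef=2)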


def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
    """
    x, y - test data, xnew - points where the approximation is evaluated
    distance = y - ynew if relative_dist is False, else 1 - y / ynew
    returns ynew - y values of the linear approximation
    """
    ox = numpy.array(x)
    oy = numpy.array(y)

    # approximation function: a straight line y = a * x + b
    def func_line(tpl, x):
        return tpl[0] * x + tpl[1]

    def error_func_rel(tpl, x, y):
        return 1.0 - y / func_line(tpl, x)

    def error_func_abs(tpl, x, y):
        return y - func_line(tpl, x)

    # choose distance mode
    error_func = error_func_rel if relative_dist else error_func_abs

    # initial guess: the line through the first two points
    tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
                                     oy[:2]))

    # fit the line with least squares
    tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy))

    # leastsq reports success via an integer flag in the range 1..4
    if success not in range(1, 5):
        raise ValueError("Can't fit a line to these points")

    # evaluate the fitted line at the new points
    return func_line(tpl_final, numpy.array(xnew))
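

# Illustrative sketch (made-up points): fit y ~ a*x + b by least squares and
# evaluate the fitted line at new x positions:
#
#     x = [1.0, 2.0, 3.0, 4.0]
#     y = [2.1, 3.9, 6.2, 8.1]
#     ynew = approximate_line(x, y, xnew=[5.0, 6.0])       # roughly [10, 12]
#     ynew_rel = approximate_line(x, y, xnew=[5.0, 6.0], relative_dist=True)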


# TODO: revise next
# def difference(y, ynew):
#     """returns average and maximum relative and
#     absolute differences between y and ynew
#     result may contain None values for y = 0
#     return value - tuple:
#     [(abs dif, rel dif) * len(y)],
#     (abs average, abs max),
#     (rel average, rel max)"""
#
#     abs_dlist = []
#     rel_dlist = []
#
#     for y1, y2 in zip(y, ynew):
#         # absolute
#         abs_dlist.append(y1 - y2)
#
#         if y1 > 1E-6:
#             rel_dlist.append(abs(abs_dlist[-1] / y1))
#         else:
#             raise ZeroDivisionError("{0!r} is too small".format(y1))
#
#     da_avg = sum(abs_dlist) / len(abs_dlist)
#     dr_avg = sum(rel_dlist) / len(rel_dlist)
#
#     return (zip(abs_dlist, rel_dlist),
#             (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
#             )