Blame - wally/statistic.py - mcp/cvp-wally

blob: e2021e1115570d68c930abb6f282294b38dc48d0 [file] [log] [blame]

koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	1	import math
				2	import itertools
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	3
				4	try:
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	5	from scipy import stats
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	6	from numpy import array, linalg
				7	from scipy.optimize import leastsq
				8	from numpy.polynomial.chebyshev import chebfit, chebval
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	9	no_numpy = False
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	10	except ImportError:
				11	no_numpy = True
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	12
				13
def average(data):
    """Return the arithmetic mean of ``data``.

    Args:
        data: Sized collection of numbers (must support ``sum`` and ``len``).

    Returns:
        The mean as a float.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    # float() forces true division, so integer samples are not silently
    # truncated when the module runs under Python 2.
    return sum(data) / float(len(data))
				16
				17
def med_dev(vals):
    """Return the mean and the sample standard deviation of ``vals``.

    Despite the name, the first item of the result is the arithmetic mean,
    not the median.

    Args:
        vals: Sized, indexable collection of numbers.

    Returns:
        A ``(mean, deviation)`` tuple. For a single sample the sample itself
        is returned with a deviation of ``0.0``. Otherwise the deviation is
        computed with the ``n - 1`` denominator.

    Raises:
        ZeroDivisionError: If ``vals`` is empty.
    """
    count = len(vals)
    if count == 1:
        # A single sample has no spread; this also avoids dividing by n - 1 == 0.
        return vals[0], 0.0

    # float() forces true division for integer samples under Python 2.
    mean = sum(vals) / float(count)
    squared_error = sum((mean - val) ** 2.0 for val in vals)
    deviation = (squared_error / (count - 1)) ** 0.5
    return mean, deviation
				25
				26
def round_3_digit(val):
    """Truncate ``val`` to three significant digits.

    The work is delegated to ``round_deviation`` with a pseudo-deviation of
    one tenth of the magnitude of ``val``; keeping two digits of that
    deviation leaves three significant digits of the value.

    Args:
        val: Number to truncate.

    Returns:
        The truncated value, converted back to the type of ``val``. Values
        whose magnitude is so small that the pseudo-deviation drops below
        ``round_deviation``'s cut-off are returned unchanged.
    """
    # abs() keeps the pseudo-deviation positive: a negative one would fall
    # under round_deviation's small-deviation cut-off and the value would
    # come back unrounded.
    return round_deviation((val, abs(val) / 10.0))[0]
				29
				30
def round_deviation(med_dev):
    """Truncate a ``(value, deviation)`` pair to the precision of the deviation.

    The deviation is cut to its two leading significant digits and the value
    is cut at the same decimal position. Both are truncated toward zero
    (``int``), not rounded to nearest.

    Args:
        med_dev: Two-item sequence ``(value, deviation)``.

    Returns:
        ``med_dev`` itself, unchanged, when the deviation is ``None`` or
        below ``1E-7`` (which includes zero and negative deviations).
        Otherwise a new two-item list ``[value, deviation]`` whose items are
        converted back to the types of the corresponding inputs.
    """
    value, deviation = med_dev

    # None is tested explicitly: Python 2 ordered None below every number, so
    # such a pair passed through untouched, while Python 3 raises TypeError on
    # the comparison.
    if deviation is None or deviation < 1E-7:
        return med_dev

    # Weight of the second significant digit of the deviation.
    step = 10.0 ** (math.floor(math.log10(deviation)) - 1)
    cut_deviation = int(deviation / step) * step
    cut_value = int(value / step) * step

    return [type(value)(cut_value),
            type(deviation)(cut_deviation)]
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	42
				43
				44	def groupby_globally(data, key_func):
				45	grouped = {}
				46	grouped_iter = itertools.groupby(data, key_func)
				47
				48	for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
				49	key = (bs, cache_tp, act, conc)
				50	grouped.setdefault(key, []).extend(curr_data_it)
				51
				52	return grouped
				53
				54
				55	def approximate_curve(x, y, xnew, curved_coef):
				56	"""returns ynew - y values of some curve approximation"""
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	57	if no_numpy:
				58	return None
				59
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	60	return chebval(xnew, chebfit(x, y, curved_coef))
				61
				62
				63	def approximate_line(x, y, xnew, relative_dist=False):
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	64	""" x, y - test data, xnew - dots, where we want find approximation
				65	if not relative_dist distance = y - newy
				66	returns ynew - y values of linear approximation"""
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	67
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	68	if no_numpy:
				69	return None
				70
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	71	# convert to numpy.array (don't work without it)
				72	ox = array(x)
				73	oy = array(y)
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	74
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	75	# set approximation function
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	76	def func_line(tpl, x):
				77	return tpl[0] * x + tpl[1]
				78
				79	def error_func_rel(tpl, x, y):
				80	return 1.0 - y / func_line(tpl, x)
				81
				82	def error_func_abs(tpl, x, y):
				83	return y - func_line(tpl, x)
				84
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	85	# choose distance mode
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	86	error_func = error_func_rel if relative_dist else error_func_abs
				87
				88	tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
				89	oy[:2]))
				90
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	91	# find line
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	92	tpl_final, success = leastsq(error_func,
				93	tpl_initial[:],
				94	args=(ox, oy))
				95
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	96	# if error
				97	if success not in range(1, 5):
				98	raise ValueError("No line for this dots")
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	99
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	100	# return new dots
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	101	return func_line(tpl_final, array(xnew))
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	102
				103
				104	def difference(y, ynew):
				105	"""returns average and maximum relative and
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	106	absolute differences between y and ynew
				107	result may contain None values for y = 0
				108	return value - tuple:
				109	[(abs dif, rel dif) * len(y)],
				110	(abs average, abs max),
				111	(rel average, rel max)"""
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	112
				113	abs_dlist = []
				114	rel_dlist = []
				115
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	116	for y1, y2 in zip(y, ynew):
				117	# absolute
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	118	abs_dlist.append(y1 - y2)
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	119
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	120	if y1 > 1E-6:
				121	rel_dlist.append(abs(abs_dlist[-1] / y1))
				122	else:
				123	raise ZeroDivisionError("{0!r} is too small".format(y1))
				124
				125	da_avg = sum(abs_dlist) / len(abs_dlist)
				126	dr_avg = sum(rel_dlist) / len(rel_dlist)
				127
				128	return (zip(abs_dlist, rel_dlist),
				129	(da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
				130	)
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	131
				132
				133	def calculate_distribution_properties(data):
				134	"""chi, etc"""
				135
				136
def minimal_measurement_count(data, max_diff, req_probability):
    """Placeholder for estimating how many measurements are required.

    Intended contract: return the number of measurements needed to get
    results (average and deviation) with an error less than ``max_diff`` in
    at least ``req_probability`` percent of cases.

    Not implemented: the function has no body, ignores its arguments and
    always returns ``None``.
    """
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	142
				143
				144	class StatProps(object):
				145	def __init__(self):
				146	self.average = None
				147	self.mediana = None
				148	self.perc_95 = None
				149	self.perc_5 = None
				150	self.deviation = None
				151	self.confidence = None
				152	self.min = None
				153	self.max = None
koder aka kdanilov	4af1c1d	2015-05-18 15:48:58 +0300	[diff] [blame]	154	self.raw = None
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	155
				156	def rounded_average_conf(self):
				157	return round_deviation((self.average, self.confidence))
				158
koder aka kdanilov	416b87a	2015-05-12 00:26:04 +0300	[diff] [blame]	159	def rounded_average_dev(self):
				160	return round_deviation((self.average, self.deviation))
				161
				162	def __str__(self):
				163	return "StatProps({0} ~ {1})".format(round_3_digit(self.average),
				164	round_3_digit(self.deviation))
				165
				166	def __repr__(self):
				167	return str(self)
				168
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	169
				170	def data_property(data, confidence=0.95):
				171	res = StatProps()
				172	if len(data) == 0:
				173	return res
				174
				175	data = sorted(data)
				176	res.average, res.deviation = med_dev(data)
				177	res.max = data[-1]
				178	res.min = data[0]
				179
				180	ln = len(data)
				181	if ln % 2 == 0:
				182	res.mediana = (data[ln / 2] + data[ln / 2 - 1]) / 2
				183	else:
				184	res.mediana = data[ln / 2]
				185
				186	res.perc_95 = data[int((ln - 1) * 0.95)]
				187	res.perc_5 = data[int((ln - 1) * 0.05)]
				188
				189	if not no_numpy and ln >= 3:
				190	res.confidence = stats.sem(data) * \
				191	stats.t.ppf((1 + confidence) / 2, ln - 1)
				192	else:
				193	res.confidence = res.deviation
				194
koder aka kdanilov	4af1c1d	2015-05-18 15:48:58 +0300	[diff] [blame]	195	res.raw = data[:]
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	196	return res