Blame - wally/statistic.py - mcp/cvp-wally

blob: 81806190a94e0bcfa4539c8830aad3112cd1152d [file] [log] [blame]

koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	1	import math
				2	import itertools
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	3
				4	try:
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	5	from scipy import stats
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	6	from numpy import array, linalg
				7	from scipy.optimize import leastsq
				8	from numpy.polynomial.chebyshev import chebfit, chebval
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	9	no_numpy = False
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	10	except ImportError:
				11	no_numpy = True
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	12
				13
				14	def med_dev(vals):
				15	med = sum(vals) / len(vals)
				16	dev = ((sum(abs(med - i) 2.0 for i in vals) / len(vals)) 0.5)
				17	return med, dev
				18
				19
def round_3_digit(val):
    """Truncate val to three significant digits.

    The work is delegated to round_deviation() with a pseudo-deviation
    of one tenth of the value's magnitude, and only the rounded value
    is returned.

    val - number to round
    returns the truncated value, converted back to type(val) by
            round_deviation(); values whose magnitude / 10 is below
            1E-7 come back unchanged
    """
    # abs() keeps the pseudo-deviation positive, so negative values are
    # rounded too instead of falling into the "deviation too small" path
    return round_deviation((val, abs(val) / 10.0))[0]
				22
				23
def round_deviation(med_dev):
    """Truncate a (value, deviation) pair to the precision of the deviation.

    Both numbers are cut toward zero to a multiple of
    10 ** (floor(log10(deviation)) - 1), so the deviation keeps two
    significant digits and the value is cut at the same decimal place.

    med_dev - two-item sequence (value, deviation)
    returns med_dev itself, untouched, when deviation < 1E-7;
            otherwise a new two-item list whose items are converted
            back to the types of the corresponding input items
    """
    value, spread = med_dev

    # log10 is unusable for such a small (or non-positive) deviation
    if spread < 1E-7:
        return med_dev

    step = 10.0 ** (math.floor(math.log10(spread)) - 1)
    cut_spread = int(spread / step) * step
    cut_value = int(value / step) * step

    value_type = type(med_dev[0])
    spread_type = type(med_dev[1])
    return [value_type(cut_value), spread_type(cut_spread)]
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	35
				36
				37	def groupby_globally(data, key_func):
				38	grouped = {}
				39	grouped_iter = itertools.groupby(data, key_func)
				40
				41	for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
				42	key = (bs, cache_tp, act, conc)
				43	grouped.setdefault(key, []).extend(curr_data_it)
				44
				45	return grouped
				46
				47
				48	def approximate_curve(x, y, xnew, curved_coef):
				49	"""returns ynew - y values of some curve approximation"""
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	50	if no_numpy:
				51	return None
				52
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	53	return chebval(xnew, chebfit(x, y, curved_coef))
				54
				55
				56	def approximate_line(x, y, xnew, relative_dist=False):
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	57	""" x, y - test data, xnew - dots, where we want find approximation
				58	if not relative_dist distance = y - newy
				59	returns ynew - y values of linear approximation"""
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	60
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	61	if no_numpy:
				62	return None
				63
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	64	# convert to numpy.array (don't work without it)
				65	ox = array(x)
				66	oy = array(y)
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	67
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	68	# set approximation function
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	69	def func_line(tpl, x):
				70	return tpl[0] * x + tpl[1]
				71
				72	def error_func_rel(tpl, x, y):
				73	return 1.0 - y / func_line(tpl, x)
				74
				75	def error_func_abs(tpl, x, y):
				76	return y - func_line(tpl, x)
				77
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	78	# choose distance mode
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	79	error_func = error_func_rel if relative_dist else error_func_abs
				80
				81	tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
				82	oy[:2]))
				83
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	84	# find line
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	85	tpl_final, success = leastsq(error_func,
				86	tpl_initial[:],
				87	args=(ox, oy))
				88
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	89	# if error
				90	if success not in range(1, 5):
				91	raise ValueError("No line for this dots")
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	92
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	93	# return new dots
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	94	return func_line(tpl_final, array(xnew))
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	95
				96
				97	def difference(y, ynew):
				98	"""returns average and maximum relative and
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	99	absolute differences between y and ynew
				100	result may contain None values for y = 0
				101	return value - tuple:
				102	[(abs dif, rel dif) * len(y)],
				103	(abs average, abs max),
				104	(rel average, rel max)"""
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	105
				106	abs_dlist = []
				107	rel_dlist = []
				108
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	109	for y1, y2 in zip(y, ynew):
				110	# absolute
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	111	abs_dlist.append(y1 - y2)
Ved-vampir	0316644	2015-04-10 17:28:23 +0300	[diff] [blame]	112
koder aka kdanilov	66839a9	2015-04-11 13:22:31 +0300	[diff] [blame]	113	if y1 > 1E-6:
				114	rel_dlist.append(abs(abs_dlist[-1] / y1))
				115	else:
				116	raise ZeroDivisionError("{0!r} is too small".format(y1))
				117
				118	da_avg = sum(abs_dlist) / len(abs_dlist)
				119	dr_avg = sum(rel_dlist) / len(rel_dlist)
				120
				121	return (zip(abs_dlist, rel_dlist),
				122	(da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
				123	)
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	124
				125
				126	def calculate_distribution_properties(data):
				127	"""chi, etc"""
				128
				129
				130	def minimal_measurement_amount(data, max_diff, req_probability):
				131	"""
				132	should returns amount of measurements to get results (avg and deviation)
				133	with error less, that max_diff in at least req_probability% cases
				134	"""
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	135
				136
				137	class StatProps(object):
				138	def __init__(self):
				139	self.average = None
				140	self.mediana = None
				141	self.perc_95 = None
				142	self.perc_5 = None
				143	self.deviation = None
				144	self.confidence = None
				145	self.min = None
				146	self.max = None
				147
				148	def rounded_average_conf(self):
				149	return round_deviation((self.average, self.confidence))
				150
koder aka kdanilov	416b87a	2015-05-12 00:26:04 +0300	[diff] [blame]	151	def rounded_average_dev(self):
				152	return round_deviation((self.average, self.deviation))
				153
				154	def __str__(self):
				155	return "StatProps({0} ~ {1})".format(round_3_digit(self.average),
				156	round_3_digit(self.deviation))
				157
				158	def __repr__(self):
				159	return str(self)
				160
koder aka kdanilov	f86d7af	2015-05-06 04:01:54 +0300	[diff] [blame]	161
				162	def data_property(data, confidence=0.95):
				163	res = StatProps()
				164	if len(data) == 0:
				165	return res
				166
				167	data = sorted(data)
				168	res.average, res.deviation = med_dev(data)
				169	res.max = data[-1]
				170	res.min = data[0]
				171
				172	ln = len(data)
				173	if ln % 2 == 0:
				174	res.mediana = (data[ln / 2] + data[ln / 2 - 1]) / 2
				175	else:
				176	res.mediana = data[ln / 2]
				177
				178	res.perc_95 = data[int((ln - 1) * 0.95)]
				179	res.perc_5 = data[int((ln - 1) * 0.05)]
				180
				181	if not no_numpy and ln >= 3:
				182	res.confidence = stats.sem(data) * \
				183	stats.t.ppf((1 + confidence) / 2, ln - 1)
				184	else:
				185	res.confidence = res.deviation
				186
				187	return res