wally/statistic.py - mcp/cvp-wally - Gitiles

 import math
 import itertools

 try:
     from scipy import stats
     from numpy import array, linalg
     from scipy.optimize import leastsq
     from numpy.polynomial.chebyshev import chebfit, chebval
     no_numpy = False
 except ImportError:
     no_numpy = True


 def average(data):
     """Return the arithmetic mean of ``data``.

     ``data`` must be a non-empty sized collection of numbers; an empty
     one raises ZeroDivisionError.
     """
     total = sum(data)
     count = len(data)
     return total / count


 def med_dev(vals):
     """Return ``(mean, deviation)`` for ``vals``.

     The deviation is the square root of the mean squared distance of
     every value from the mean (all values are weighted by 1 / len(vals)).
     """
     count = len(vals)
     mean = sum(vals) / count
     squared_dist = sum(abs(mean - item) ** 2.0 for item in vals)
     return mean, (squared_dist / count) ** 0.5


 def round_3_digit(val):
     """Round ``val`` by treating a tenth of it as its deviation.

     Delegates to round_deviation() and keeps only the rounded value.
     """
     rounded_val, _ = round_deviation((val, val / 10.0))
     return rounded_val


 def round_deviation(med_dev):
     """Truncate a ``(value, deviation)`` pair to the deviation's precision.

     Both numbers are cut down to a multiple of a power of ten chosen so
     that the deviation keeps its two leading digits. Each result is
     converted back to the type of the corresponding input element.

     If the deviation is below 1E-7 the input pair is returned unchanged;
     otherwise a new two-element list is returned.
     """
     value, spread = med_dev

     # Too small (or non-positive) to take a logarithm of: nothing to round.
     if spread < 1E-7:
         return med_dev

     exponent = math.floor(math.log10(spread)) - 1
     step = 10.0 ** exponent

     value_type = type(med_dev[0])
     spread_type = type(med_dev[1])

     truncated_spread = int(spread / step) * step
     truncated_value = int(value / step) * step
     return [value_type(truncated_value), spread_type(truncated_spread)]


 def groupby_globally(data, key_func):
     """Group all items of ``data`` by ``key_func`` regardless of order.

     itertools.groupby only merges adjacent items, so the runs it yields
     are accumulated into a dict to join items that share a key but are
     not neighbours. ``key_func`` must return a four-element key
     ``(bs, cache_tp, act, conc)``.

     Returns a dict mapping each key tuple to the list of its items, in
     the order they appear in ``data``.
     """
     result = {}

     for group_key, members in itertools.groupby(data, key_func):
         bs, cache_tp, act, conc = group_key
         bucket = result.setdefault((bs, cache_tp, act, conc), [])
         bucket.extend(members)

     return result


 def approximate_curve(x, y, xnew, curved_coef):
     """Fit a Chebyshev series to ``(x, y)`` and evaluate it at ``xnew``.

     ``curved_coef`` is passed to chebfit as the degree of the fit.
     Returns the fitted y values for ``xnew``, or None when the
     numpy/scipy imports failed.
     """
     if not no_numpy:
         coefs = chebfit(x, y, curved_coef)
         return chebval(xnew, coefs)

     return None


 def approximate_line(x, y, xnew, relative_dist=False):
     """Fit a straight line to ``(x, y)`` and evaluate it at ``xnew``.

     x, y - measured data; xnew - points where the approximation is wanted.
     If ``relative_dist`` is false the fitted residual is ``y - line(x)``,
     otherwise it is ``1 - y / line(x)``.

     Returns the line's values at ``xnew``, or None when the numpy/scipy
     imports failed. Raises ValueError if leastsq reports a status
     outside 1..4.
     """
     if no_numpy:
         return None

     # leastsq needs numpy arrays so the residuals are computed elementwise
     xs, ys = array(x), array(y)

     def line(coefs, arg):
         return coefs[0] * arg + coefs[1]

     # pick the residual matching the requested distance mode
     if relative_dist:
         def residual(coefs, arg, val):
             return 1.0 - val / line(coefs, arg)
     else:
         def residual(coefs, arg, val):
             return val - line(coefs, arg)

     # initial guess: the exact line through the first two points
     system = [[xs[0], 1.0], [xs[1], 1.0]]
     start = tuple(linalg.solve(system, ys[:2]))

     coefs, status = leastsq(residual, start, args=(xs, ys))

     if status not in (1, 2, 3, 4):
         raise ValueError("No line for this dots")

     return line(coefs, array(xnew))


 def difference(y, ynew):
     """Compare reference values ``y`` with approximated values ``ynew``.

     For every pair the signed difference ``y - ynew`` is computed,
     together with its magnitude relative to ``y``.

     Return value - tuple:
         [(dif, rel dif) * len(y)] - a list with one pair per element,
         (dif average, dif max),
         (rel average, rel max)

     The differences are signed, so their average and maximum are taken
     over signed values; the relative differences are always >= 0.

     Raises ZeroDivisionError if any value of ``y`` is not greater than
     1E-6, or if no pairs were given at all.
     """

     abs_dlist = []
     rel_dlist = []

     for y1, y2 in zip(y, ynew):
         delta = y1 - y2
         abs_dlist.append(delta)

         if y1 > 1E-6:
             rel_dlist.append(abs(delta / y1))
         else:
             # the relative difference is meaningless for such a reference
             raise ZeroDivisionError("{0!r} is too small".format(y1))

     da_avg = sum(abs_dlist) / len(abs_dlist)
     dr_avg = sum(rel_dlist) / len(rel_dlist)

     # Build a real list: on Python 3 a bare zip() is a one-shot iterator,
     # which would not match the documented list of pairs.
     pairs = list(zip(abs_dlist, rel_dlist))

     return (pairs,
             (da_avg, max(abs_dlist)),
             (dr_avg, max(rel_dlist)))


 def calculate_distribution_properties(data):
     """Placeholder for distribution properties of ``data`` (chi, etc).

     Not implemented: the body is empty, so calling it returns None.
     """


 def minimal_measurement_amount(data, max_diff, req_probability):
     """
     Should return the number of measurements needed to get results
     (avg and deviation) with an error less than max_diff in at least
     req_probability% of cases.

     Not implemented: the body is empty, so calling it returns None.
     """


 class StatProps(object):
     """Container for the summary statistics of one data series.

     Every attribute starts as None and is expected to be filled in by
     the code that creates the instance.
     """

     def __init__(self):
         self.average = None
         self.mediana = None
         self.perc_95 = None
         self.perc_5 = None
         self.deviation = None
         self.confidence = None
         self.min = None
         self.max = None
         self.raw = None

     def rounded_average_conf(self):
         """Return average and confidence rounded by round_deviation()."""
         return round_deviation((self.average, self.confidence))

     def rounded_average_dev(self):
         """Return average and deviation rounded by round_deviation()."""
         return round_deviation((self.average, self.deviation))

     def __str__(self):
         """Return ``StatProps(<average> ~ <deviation>)`` with rounded values.

         An instance whose average or deviation is still None is printed
         with the raw values, since None cannot be rounded.
         """
         if self.average is None or self.deviation is None:
             return "StatProps({0} ~ {1})".format(self.average,
                                                  self.deviation)

         return "StatProps({0} ~ {1})".format(round_3_digit(self.average),
                                              round_3_digit(self.deviation))

     def __repr__(self):
         return str(self)


 def data_property(data, confidence=0.95):
     """Compute summary statistics for ``data`` and return a StatProps.

     data - sized collection of numbers; it is sorted into a new list and
         the caller's object is not modified.
     confidence - confidence level used for the ``confidence`` attribute.

     For empty ``data`` a StatProps with all attributes left as None is
     returned. Otherwise average, deviation, min, max, mediana, perc_5,
     perc_95, confidence and raw (the sorted values) are filled in.

     ``confidence`` is the standard error of the mean multiplied by the
     Student t quantile for ``(1 + confidence) / 2`` with ``len(data) - 1``
     degrees of freedom. When numpy/scipy are unavailable or there are
     fewer than 3 values, the deviation is used instead.
     """
     res = StatProps()
     if len(data) == 0:
         return res

     data = sorted(data)
     res.average, res.deviation = med_dev(data)
     res.min = data[0]
     res.max = data[-1]

     ln = len(data)
     # Floor division: true division gives a float on Python 3, and a
     # float is not a valid list index.
     mid = ln // 2
     if ln % 2 == 0:
         # even count: mean of the two central values
         res.mediana = (data[mid] + data[mid - 1]) / 2
     else:
         res.mediana = data[mid]

     # percentiles are taken at the truncated index in the sorted values
     res.perc_95 = data[int((ln - 1) * 0.95)]
     res.perc_5 = data[int((ln - 1) * 0.05)]

     if not no_numpy and ln >= 3:
         res.confidence = stats.sem(data) * \
             stats.t.ppf((1 + confidence) / 2, ln - 1)
     else:
         res.confidence = res.deviation

     res.raw = data[:]
     return res
	import math
	import itertools

	try:
	from scipy import stats
	from numpy import array, linalg
	from scipy.optimize import leastsq
	from numpy.polynomial.chebyshev import chebfit, chebval
	no_numpy = False
	except ImportError:
	no_numpy = True


	def average(data):
	return sum(data) / len(data)


	def med_dev(vals):
	med = sum(vals) / len(vals)
	dev = ((sum(abs(med - i) 2.0 for i in vals) / len(vals)) 0.5)
	return med, dev


	def round_3_digit(val):
	return round_deviation((val, val / 10.0))[0]


	def round_deviation(med_dev):
	med, dev = med_dev

	if dev < 1E-7:
	return med_dev

	dev_div = 10.0 ** (math.floor(math.log10(dev)) - 1)
	dev = int(dev / dev_div) * dev_div
	med = int(med / dev_div) * dev_div
	return [type(med_dev[0])(med),
	type(med_dev[1])(dev)]


	def groupby_globally(data, key_func):
	grouped = {}
	grouped_iter = itertools.groupby(data, key_func)

	for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
	key = (bs, cache_tp, act, conc)
	grouped.setdefault(key, []).extend(curr_data_it)

	return grouped


	def approximate_curve(x, y, xnew, curved_coef):
	"""returns ynew - y values of some curve approximation"""
	if no_numpy:
	return None

	return chebval(xnew, chebfit(x, y, curved_coef))


	def approximate_line(x, y, xnew, relative_dist=False):
	""" x, y - test data, xnew - dots, where we want find approximation
	if not relative_dist distance = y - newy
	returns ynew - y values of linear approximation"""

	if no_numpy:
	return None

	# convert to numpy.array (don't work without it)
	ox = array(x)
	oy = array(y)

	# set approximation function
	def func_line(tpl, x):
	return tpl[0] * x + tpl[1]

	def error_func_rel(tpl, x, y):
	return 1.0 - y / func_line(tpl, x)

	def error_func_abs(tpl, x, y):
	return y - func_line(tpl, x)

	# choose distance mode
	error_func = error_func_rel if relative_dist else error_func_abs

	tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
	oy[:2]))

	# find line
	tpl_final, success = leastsq(error_func,
	tpl_initial[:],
	args=(ox, oy))

	# if error
	if success not in range(1, 5):
	raise ValueError("No line for this dots")

	# return new dots
	return func_line(tpl_final, array(xnew))


	def difference(y, ynew):
	"""returns average and maximum relative and
	absolute differences between y and ynew
	result may contain None values for y = 0
	return value - tuple:
	[(abs dif, rel dif) * len(y)],
	(abs average, abs max),
	(rel average, rel max)"""

	abs_dlist = []
	rel_dlist = []

	for y1, y2 in zip(y, ynew):
	# absolute
	abs_dlist.append(y1 - y2)

	if y1 > 1E-6:
	rel_dlist.append(abs(abs_dlist[-1] / y1))
	else:
	raise ZeroDivisionError("{0!r} is too small".format(y1))

	da_avg = sum(abs_dlist) / len(abs_dlist)
	dr_avg = sum(rel_dlist) / len(rel_dlist)

	return (zip(abs_dlist, rel_dlist),
	(da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
	)


	def calculate_distribution_properties(data):
	"""chi, etc"""


	def minimal_measurement_amount(data, max_diff, req_probability):
	"""
	should returns amount of measurements to get results (avg and deviation)
	with error less, that max_diff in at least req_probability% cases
	"""


	class StatProps(object):
	def __init__(self):
	self.average = None
	self.mediana = None
	self.perc_95 = None
	self.perc_5 = None
	self.deviation = None
	self.confidence = None
	self.min = None
	self.max = None
	self.raw = None

	def rounded_average_conf(self):
	return round_deviation((self.average, self.confidence))

	def rounded_average_dev(self):
	return round_deviation((self.average, self.deviation))

	def __str__(self):
	return "StatProps({0} ~ {1})".format(round_3_digit(self.average),
	round_3_digit(self.deviation))

	def __repr__(self):
	return str(self)


	def data_property(data, confidence=0.95):
	res = StatProps()
	if len(data) == 0:
	return res

	data = sorted(data)
	res.average, res.deviation = med_dev(data)
	res.max = data[-1]
	res.min = data[0]

	ln = len(data)
	if ln % 2 == 0:
	res.mediana = (data[ln / 2] + data[ln / 2 - 1]) / 2
	else:
	res.mediana = data[ln / 2]

	res.perc_95 = data[int((ln - 1) * 0.95)]
	res.perc_5 = data[int((ln - 1) * 0.05)]

	if not no_numpy and ln >= 3:
	res.confidence = stats.sem(data) * \
	stats.t.ppf((1 + confidence) / 2, ln - 1)
	else:
	res.confidence = res.deviation

	res.raw = data[:]
	return res