wally/statistic.py - mcp/cvp-wally - Gitiles

 import math
 import itertools
 import statistics
 from typing import Union, List, TypeVar, Callable, Iterable, Tuple, Any, cast, Dict

 import numpy
 from scipy import stats, optimize
 from numpy import linalg
 from numpy.polynomial.chebyshev import chebfit, chebval


 from .result_classes import IStorable


 # Scalar type accepted by the numeric helpers in this module.
 Number = Union[int, float]
 # Constrained type variable: a function annotated with it returns the same
 # numeric type (int or float) that it was given.
 TNumber = TypeVar('TNumber', int, float)


 # NOTE(review): not referenced in the visible part of this file; presumably a
 # tolerance for comparing floats - confirm against the callers.
 DOUBLE_DELTA = 1e-8


 # Short aliases used by calc_stat_props.
 average = statistics.mean
 # NOTE(review): despite the name, `dev` is the sample *variance*, not the
 # standard deviation - confirm that consumers of StatProps.deviation expect
 # a variance.
 dev = statistics.variance


 class StatProps(IStorable):
     """Statistical properties of a time series.

     A plain attribute container: every statistic starts as ``None`` and is
     filled in afterwards (``calc_stat_props`` in this module does that).
     """

     yaml_tag = 'stat'

     def __init__(self, data: List[Number]) -> None:
         """Store ``data`` and reset every statistic to ``None``."""
         self.average = None  # type: float
         self.deviation = None  # type: float
         # half-width of the confidence interval and the level it refers to
         self.confidence = None  # type: float
         self.confidence_level = None  # type: float

         self.perc_99 = None  # type: float
         self.perc_95 = None  # type: float
         self.perc_90 = None  # type: float
         self.perc_50 = None   # type: float

         self.min = None  # type: Number
         self.max = None  # type: Number

         # (bin counts, bin edges) - the pair returned by numpy.histogram
         self.histo = None  # type: Tuple[List[int], List[float]]
         self.data = data

         self.normtest = None  # type: Any

     def __str__(self) -> str:
         """Return a multi-line, human readable summary of the statistics."""
         # The field names below must match the attributes set in __init__
         # (perc_50, normtest, ...): str.format raises AttributeError for an
         # unknown attribute, which would also break repr().
         res = ["StatProps(num_el={}):".format(len(self.data)),
                "    distr = {0.average} ~ {0.deviation}".format(self),
                "    confidence({0.confidence_level}) = {0.confidence}".format(self),
                "    perc50={0.perc_50}".format(self),
                "    perc90={0.perc_90}".format(self),
                "    perc95={0.perc_95}".format(self),
                "    perc99={0.perc_99}".format(self),
                "    range {0.min} {0.max}".format(self),
                "    normtest = {0.normtest}".format(self)]
         return "\n".join(res)

     def __repr__(self) -> str:
         """Same text as ``str(self)``."""
         return str(self)

     def raw(self) -> Dict[str, Any]:
         """Return a shallow copy of the instance attributes as a dict."""
         return self.__dict__.copy()

     @classmethod
     def fromraw(cls, data: Dict[str, Any]) -> 'StatProps':
         """Rebuild an instance from a dict produced by ``raw``.

         ``__init__`` is bypassed on purpose: the attributes are restored
         directly from ``data``.
         """
         res = cls.__new__(cls)
         res.__dict__.update(data)
         return res


 def greater_digit_pos(val: Number) -> int:
     """Return the decimal position of the most significant digit of ``val``.

     The result is ``floor(log10(|val|)) + 1``: 1 for values in [1, 10),
     2 for [10, 100), 0 for [0.1, 1) and so on.  The sign is ignored.

     Raises:
         ValueError: if ``val`` is zero, which has no significant digit.
     """
     if val == 0:
         raise ValueError("Zero has no most significant digit")
     # abs() lets negative values through instead of failing inside log10
     return int(math.floor(math.log10(abs(val)))) + 1


 def round_digits(val: TNumber, num_digits: int = 3) -> TNumber:
     """Truncate ``val`` to its ``num_digits`` most significant digits.

     The lower digits are dropped (truncation toward zero, not rounding to
     nearest) and the result is converted back to the type of ``val``, e.g.
     ``round_digits(123456)`` gives ``123000``.  Zero is returned unchanged.
     """
     if val == 0:
         # zero has no significant digit and log10(0) is undefined
         return val
     # abs(): the digit position does not depend on the sign
     scale = 10 ** (greater_digit_pos(abs(val)) - num_digits)
     return type(val)(int(val / scale) * scale)


 def calc_stat_props(data: List[Number], confidence: float = 0.95) -> StatProps:
     """Calculate statistical properties of an array of numbers.

     Args:
         data: non-empty list of samples; the list itself is not modified.
         confidence: confidence level used for the confidence interval.

     Returns:
         A StatProps with average, deviation, min/max, percentiles and
         histogram filled in.  Statistics that need more samples than were
         given are left as ``None``: deviation needs 2, the confidence
         interval 3 and the normality test 8.

     Raises:
         ValueError: if ``data`` is empty.
     """
     if len(data) == 0:
         raise ValueError("Input array is empty")

     num_samples = len(data)
     res = StatProps(data)
     # remember which level the confidence interval below refers to
     res.confidence_level = confidence

     data = sorted(data)
     res.average = average(data)
     # NOTE(review): `dev` is the module-level alias of statistics.variance,
     # so this stores the sample variance.  It needs at least two samples.
     res.deviation = dev(data) if num_samples >= 2 else None
     res.max = data[-1]
     res.min = data[0]

     res.perc_50 = numpy.percentile(data, 50)
     res.perc_90 = numpy.percentile(data, 90)
     res.perc_95 = numpy.percentile(data, 95)
     res.perc_99 = numpy.percentile(data, 99)

     if num_samples >= 3:
         # half-width of the two-sided Student's t interval around the mean
         res.confidence = stats.sem(data) * \
                          stats.t.ppf((1 + confidence) / 2, num_samples - 1)
     else:
         res.confidence = None

     res.histo = numpy.histogram(data, 'auto')
     # the normality test refuses to run on fewer than 8 samples
     res.normtest = stats.mstats.normaltest(data) if num_samples >= 8 else None
     return res


 def groupby_globally(data: Iterable, key_func: Callable):
     """Group all items of ``data`` by ``key_func``, regardless of their order.

     ``itertools.groupby`` only merges adjacent items, so the runs it yields
     are accumulated in a dict to bring equal keys together.  ``key_func``
     must return a 4-element key; it is stored as a tuple.
     """
     result = {}  # type: ignore

     for group_key, members in itertools.groupby(data, key_func):
         bs, cache_tp, act, conc = group_key
         bucket = result.setdefault((bs, cache_tp, act, conc), [])
         bucket.extend(members)

     return result


 def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
     """Fit a Chebyshev series to (x, y) and return its values at ``xnew``.

     ``curved_coef`` is handed to ``chebfit`` as the degree of the fit.
     """
     coefficients = chebfit(x, y, curved_coef)
     ynew = chebval(xnew, coefficients)
     return cast(List[float], ynew)


 def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
     """
     Fit a straight line to the points (x, y) by least squares.

     x, y - test data (at least two points with different x values)
     xnew - dots, where we want to find the approximation
     relative_dist - if False the minimised distance is y - ynew,
                     otherwise the relative one, 1 - y / ynew

     returns ynew - y values of the linear approximation at xnew

     raises ValueError if fewer than two points are given, if all x values
     coincide, or if the optimiser reports a failure
     """
     ox = numpy.array(x)
     oy = numpy.array(y)

     if ox.size < 2:
         raise ValueError("At least two points are required to fit a line")

     # tpl is (slope, intercept)
     def func_line(tpl, xs):
         return tpl[0] * xs + tpl[1]

     def error_func_rel(tpl, xs, ys):
         return 1.0 - ys / func_line(tpl, xs)

     def error_func_abs(tpl, xs, ys):
         return ys - func_line(tpl, xs)

     # choose distance mode
     error_func = error_func_rel if relative_dist else error_func_abs

     # Initial guess: the line through the first point and the first later
     # point with a different x.  Taking the second point blindly makes the
     # 2x2 system singular whenever the first two x values are equal.
     second = next((idx for idx in range(1, ox.size) if ox[idx] != ox[0]), None)
     if second is None:
         raise ValueError("No line for this dots")

     tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[second], 1.0]],
                                      [oy[0], oy[second]]))

     # find line
     tpl_final, success = optimize.leastsq(error_func, tpl_initial, args=(ox, oy))

     # leastsq reports success with a status code from 1 to 4
     if success not in range(1, 5):
         raise ValueError("No line for this dots")

     # return new dots
     return func_line(tpl_final, numpy.array(xnew))


 # TODO: revise the commented-out `difference` helper below before re-enabling it
 # def difference(y, ynew):
 #     """returns average and maximum relative and
 #        absolute differences between y and ynew
 #        result may contain None values for y = 0
 #        return value - tuple:
 #        [(abs dif, rel dif) * len(y)],
 #        (abs average, abs max),
 #        (rel average, rel max)"""
 #
 #     abs_dlist = []
 #     rel_dlist = []
 #
 #     for y1, y2 in zip(y, ynew):
 #         # absolute
 #         abs_dlist.append(y1 - y2)
 #
 #         if y1 > 1E-6:
 #             rel_dlist.append(abs(abs_dlist[-1] / y1))
 #         else:
 #             raise ZeroDivisionError("{0!r} is too small".format(y1))
 #
 #     da_avg = sum(abs_dlist) / len(abs_dlist)
 #     dr_avg = sum(rel_dlist) / len(rel_dlist)
 #
 #     return (zip(abs_dlist, rel_dlist),
 #             (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
 #             )
	import math
	import itertools
	import statistics
	from typing import Union, List, TypeVar, Callable, Iterable, Tuple, Any, cast, Dict

	import numpy
	from scipy import stats, optimize
	from numpy import linalg
	from numpy.polynomial.chebyshev import chebfit, chebval


	from .result_classes import IStorable


	# Scalar type accepted by the numeric helpers in this module.
	Number = Union[int, float]
	# Constrained type variable: a function annotated with it returns the same
	# numeric type (int or float) that it was given.
	TNumber = TypeVar('TNumber', int, float)


	# NOTE(review): not referenced in the visible part of this file; presumably a
	# tolerance for comparing floats - confirm against the callers.
	DOUBLE_DELTA = 1e-8


	# Short aliases used by calc_stat_props.
	average = statistics.mean
	# NOTE(review): despite the name, `dev` is the sample *variance*, not the
	# standard deviation - confirm that consumers of StatProps.deviation expect
	# a variance.
	dev = statistics.variance


	class StatProps(IStorable):
	    """Statistical properties of a time series.

	    A plain attribute container: every statistic starts as ``None`` and is
	    filled in afterwards (``calc_stat_props`` in this module does that).
	    """

	    yaml_tag = 'stat'

	    def __init__(self, data: List[Number]) -> None:
	        """Store ``data`` and reset every statistic to ``None``."""
	        self.average = None  # type: float
	        self.deviation = None  # type: float
	        # half-width of the confidence interval and the level it refers to
	        self.confidence = None  # type: float
	        self.confidence_level = None  # type: float

	        self.perc_99 = None  # type: float
	        self.perc_95 = None  # type: float
	        self.perc_90 = None  # type: float
	        self.perc_50 = None  # type: float

	        self.min = None  # type: Number
	        self.max = None  # type: Number

	        # (bin counts, bin edges) - the pair returned by numpy.histogram
	        self.histo = None  # type: Tuple[List[int], List[float]]
	        self.data = data

	        self.normtest = None  # type: Any

	    def __str__(self) -> str:
	        """Return a multi-line, human readable summary of the statistics."""
	        # The field names below must match the attributes set in __init__
	        # (perc_50, normtest, ...): str.format raises AttributeError for an
	        # unknown attribute, which would also break repr().
	        res = ["StatProps(num_el={}):".format(len(self.data)),
	               "    distr = {0.average} ~ {0.deviation}".format(self),
	               "    confidence({0.confidence_level}) = {0.confidence}".format(self),
	               "    perc50={0.perc_50}".format(self),
	               "    perc90={0.perc_90}".format(self),
	               "    perc95={0.perc_95}".format(self),
	               "    perc99={0.perc_99}".format(self),
	               "    range {0.min} {0.max}".format(self),
	               "    normtest = {0.normtest}".format(self)]
	        return "\n".join(res)

	    def __repr__(self) -> str:
	        """Same text as ``str(self)``."""
	        return str(self)

	    def raw(self) -> Dict[str, Any]:
	        """Return a shallow copy of the instance attributes as a dict."""
	        return self.__dict__.copy()

	    @classmethod
	    def fromraw(cls, data: Dict[str, Any]) -> 'StatProps':
	        """Rebuild an instance from a dict produced by ``raw``.

	        ``__init__`` is bypassed on purpose: the attributes are restored
	        directly from ``data``.
	        """
	        res = cls.__new__(cls)
	        res.__dict__.update(data)
	        return res


	def greater_digit_pos(val: Number) -> int:
	    """Return the decimal position of the most significant digit of ``val``.

	    The result is ``floor(log10(|val|)) + 1``: 1 for values in [1, 10),
	    2 for [10, 100), 0 for [0.1, 1) and so on.  The sign is ignored.

	    Raises:
	        ValueError: if ``val`` is zero, which has no significant digit.
	    """
	    if val == 0:
	        raise ValueError("Zero has no most significant digit")
	    # abs() lets negative values through instead of failing inside log10
	    return int(math.floor(math.log10(abs(val)))) + 1


	def round_digits(val: TNumber, num_digits: int = 3) -> TNumber:
	    """Truncate ``val`` to its ``num_digits`` most significant digits.

	    The lower digits are dropped (truncation toward zero, not rounding to
	    nearest) and the result is converted back to the type of ``val``, e.g.
	    ``round_digits(123456)`` gives ``123000``.  Zero is returned unchanged.
	    """
	    if val == 0:
	        # zero has no significant digit and log10(0) is undefined
	        return val
	    # abs(): the digit position does not depend on the sign
	    scale = 10 ** (greater_digit_pos(abs(val)) - num_digits)
	    return type(val)(int(val / scale) * scale)


	def calc_stat_props(data: List[Number], confidence: float = 0.95) -> StatProps:
	    """Calculate statistical properties of an array of numbers.

	    Args:
	        data: non-empty list of samples; the list itself is not modified.
	        confidence: confidence level used for the confidence interval.

	    Returns:
	        A StatProps with average, deviation, min/max, percentiles and
	        histogram filled in.  Statistics that need more samples than were
	        given are left as ``None``: deviation needs 2, the confidence
	        interval 3 and the normality test 8.

	    Raises:
	        ValueError: if ``data`` is empty.
	    """
	    if len(data) == 0:
	        raise ValueError("Input array is empty")

	    num_samples = len(data)
	    res = StatProps(data)
	    # remember which level the confidence interval below refers to
	    res.confidence_level = confidence

	    data = sorted(data)
	    res.average = average(data)
	    # NOTE(review): `dev` is the module-level alias of statistics.variance,
	    # so this stores the sample variance.  It needs at least two samples.
	    res.deviation = dev(data) if num_samples >= 2 else None
	    res.max = data[-1]
	    res.min = data[0]

	    res.perc_50 = numpy.percentile(data, 50)
	    res.perc_90 = numpy.percentile(data, 90)
	    res.perc_95 = numpy.percentile(data, 95)
	    res.perc_99 = numpy.percentile(data, 99)

	    if num_samples >= 3:
	        # half-width of the two-sided Student's t interval around the mean
	        res.confidence = stats.sem(data) * \
	                         stats.t.ppf((1 + confidence) / 2, num_samples - 1)
	    else:
	        res.confidence = None

	    res.histo = numpy.histogram(data, 'auto')
	    # the normality test refuses to run on fewer than 8 samples
	    res.normtest = stats.mstats.normaltest(data) if num_samples >= 8 else None
	    return res


	def groupby_globally(data: Iterable, key_func: Callable):
	    """Group all items of ``data`` by ``key_func``, regardless of their order.

	    ``itertools.groupby`` only merges adjacent items, so the runs it yields
	    are accumulated in a dict to bring equal keys together.  ``key_func``
	    must return a 4-element key; it is stored as a tuple.
	    """
	    grouped = {}  # type: ignore
	    grouped_iter = itertools.groupby(data, key_func)

	    for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
	        key = (bs, cache_tp, act, conc)
	        grouped.setdefault(key, []).extend(curr_data_it)

	    return grouped


	def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
	    """Fit a Chebyshev series to (x, y) and return its values at ``xnew``.

	    ``curved_coef`` is handed to ``chebfit`` as the degree of the fit.
	    """
	    return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))


	def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
	    """
	    Fit a straight line to the points (x, y) by least squares.

	    x, y - test data (at least two points with different x values)
	    xnew - dots, where we want to find the approximation
	    relative_dist - if False the minimised distance is y - ynew,
	                    otherwise the relative one, 1 - y / ynew

	    returns ynew - y values of the linear approximation at xnew

	    raises ValueError if fewer than two points are given, if all x values
	    coincide, or if the optimiser reports a failure
	    """
	    ox = numpy.array(x)
	    oy = numpy.array(y)

	    if ox.size < 2:
	        raise ValueError("At least two points are required to fit a line")

	    # tpl is (slope, intercept)
	    def func_line(tpl, xs):
	        return tpl[0] * xs + tpl[1]

	    def error_func_rel(tpl, xs, ys):
	        return 1.0 - ys / func_line(tpl, xs)

	    def error_func_abs(tpl, xs, ys):
	        return ys - func_line(tpl, xs)

	    # choose distance mode
	    error_func = error_func_rel if relative_dist else error_func_abs

	    # Initial guess: the line through the first point and the first later
	    # point with a different x.  Taking the second point blindly makes the
	    # 2x2 system singular whenever the first two x values are equal.
	    second = next((idx for idx in range(1, ox.size) if ox[idx] != ox[0]), None)
	    if second is None:
	        raise ValueError("No line for this dots")

	    tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[second], 1.0]],
	                                     [oy[0], oy[second]]))

	    # find line
	    tpl_final, success = optimize.leastsq(error_func, tpl_initial, args=(ox, oy))

	    # leastsq reports success with a status code from 1 to 4
	    if success not in range(1, 5):
	        raise ValueError("No line for this dots")

	    # return new dots
	    return func_line(tpl_final, numpy.array(xnew))


	# TODO: revise the commented-out `difference` helper below before re-enabling it
	# def difference(y, ynew):
	# """returns average and maximum relative and
	# absolute differences between y and ynew
	# result may contain None values for y = 0
	# return value - tuple:
	# [(abs dif, rel dif) * len(y)],
	# (abs average, abs max),
	# (rel average, rel max)"""
	#
	# abs_dlist = []
	# rel_dlist = []
	#
	# for y1, y2 in zip(y, ynew):
	# # absolute
	# abs_dlist.append(y1 - y2)
	#
	# if y1 > 1E-6:
	# rel_dlist.append(abs(abs_dlist[-1] / y1))
	# else:
	# raise ZeroDivisionError("{0!r} is too small".format(y1))
	#
	# da_avg = sum(abs_dlist) / len(abs_dlist)
	# dr_avg = sum(rel_dlist) / len(rel_dlist)
	#
	# return (zip(abs_dlist, rel_dlist),
	# (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
	# )