wally/statistic.py - mcp/cvp-wally - Gitiles

 import math
 import logging
 import itertools
 import statistics
 from typing import List, Callable, Iterable, cast

 import numpy
 from scipy import stats, optimize
 from numpy import linalg
 from numpy.polynomial.chebyshev import chebfit, chebval


 from .result_classes import NormStatProps, HistoStatProps, TimeSeries
 from .utils import Number


 logger = logging.getLogger("wally")
 DOUBLE_DELTA = 1e-8
 MIN_VALUES_FOR_CONFIDENCE = 7


 average = numpy.mean
 dev = lambda x: math.sqrt(numpy.var(x, ddof=1))


 def calc_norm_stat_props(ts: TimeSeries, bins_count: int, confidence: float = 0.95) -> NormStatProps:
     "Calculate statistical properties of array of numbers"

     # array.array has very basic support
     data = cast(List[int], ts.data)
     res = NormStatProps(data)  # type: ignore

     if len(data) == 0:
         raise ValueError("Input array is empty")

     data = sorted(data)
     res.average = average(data)
     res.deviation = dev(data)

     res.max = data[-1]
     res.min = data[0]

     res.perc_50, res.perc_90, res.perc_99, res.perc_99 = numpy.percentile(data, q=[50., 90., 95., 99.])

     if len(data) >= MIN_VALUES_FOR_CONFIDENCE:
         res.confidence = stats.sem(data) * \
                          stats.t.ppf((1 + confidence) / 2, len(data) - 1)
         res.confidence_level = confidence
     else:
         res.confidence = None
         res.confidence_level = None

     res.bins_populations, bins_edges = numpy.histogram(data, bins=bins_count)
     res.bins_mids = (bins_edges[:-1] + bins_edges[1:]) / 2

     try:
         res.normtest = stats.mstats.normaltest(data)
     except Exception as exc:
         logger.warning("stats.mstats.normaltest failed with error: %s", exc)

     res.skew = stats.skew(data)
     res.kurt = stats.kurtosis(data)

     return res


 def calc_histo_stat_props(ts: TimeSeries,
                           bins_edges: numpy.array,
                           bins_count: int,
                           min_valuable: float = 0.0001) -> HistoStatProps:
     data = numpy.array(ts.data, dtype='int')
     data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size]  # type: ignore

     res = HistoStatProps(ts.data, ts.second_axis_size)

     # summ across all series
     aggregated = numpy.sum(data, axis=0, dtype='int')
     total = numpy.sum(aggregated)

     # minimal value used for histo
     min_val_on_histo = total * min_valuable

     # percentiles levels
     expected = [total * 0.5, total * 0.9, total * 0.95, total * 0.99]
     percentiles = []

     # all indexes, where values greater than min_val_on_histo
     valuable_idxs = []

     curr_summ = 0
     non_zero = aggregated.nonzero()[0]

     # calculate percentiles and valuable_indexes
     for idx in non_zero:
         val = aggregated[idx]
         while expected and curr_summ + val >= expected[0]:
             percentiles.append(bins_edges[idx])
             del expected[0]

         curr_summ += val

         if val >= min_val_on_histo:
             valuable_idxs.append(idx)

     res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles

     # minimax and maximal non-zero elements
     res.min = bins_edges[aggregated[non_zero[0]]]
     res.max = bins_edges[non_zero[-1] + (1 if non_zero[-1] != len(bins_edges) else 0)]

     # minimal and maximal valueble evelemts
     val_idx_min = valuable_idxs[0]
     val_idx_max = valuable_idxs[-1]

     raw_bins_populations = aggregated[val_idx_min: val_idx_max + 1]
     raw_bins_edges = bins_edges[val_idx_min: val_idx_max + 2]
     raw_bins_mids = cast(numpy.array, (raw_bins_edges[1:] + raw_bins_edges[:-1]) / 2)

     step = (raw_bins_mids[-1] + raw_bins_mids[0]) / bins_count
     next = raw_bins_mids[0]

     # aggregate raw histogram with many bins into result histogram with bins_count bins
     cidx = 0
     bins_populations = []
     bins_mids = []

     while cidx < len(raw_bins_mids):
         next += step
         bin_population = 0

         while cidx < len(raw_bins_mids) and raw_bins_mids[cidx] <= next:
             bin_population += raw_bins_populations[cidx]
             cidx += 1

         bins_populations.append(bin_population)
         bins_mids.append(next - step / 2)

     res.bins_populations = numpy.array(bins_populations, dtype='int')
     res.bins_mids = numpy.array(bins_mids, dtype='float32')

     return res


 def groupby_globally(data: Iterable, key_func: Callable):
     grouped = {}  # type: ignore
     grouped_iter = itertools.groupby(data, key_func)

     for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
         key = (bs, cache_tp, act, conc)
         grouped.setdefault(key, []).extend(curr_data_it)

     return grouped


 def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
     """returns ynew - y values of some curve approximation"""
     return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))


 def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
     """
     x, y - test data, xnew - dots, where we want find approximation
     if not relative_dist distance = y - newy
     returns ynew - y values of linear approximation
     """
     ox = numpy.array(x)
     oy = numpy.array(y)

     # set approximation function
     def func_line(tpl, x):
         return tpl[0] * x + tpl[1]

     def error_func_rel(tpl, x, y):
         return 1.0 - y / func_line(tpl, x)

     def error_func_abs(tpl, x, y):
         return y - func_line(tpl, x)

     # choose distance mode
     error_func = error_func_rel if relative_dist else error_func_abs

     tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
                                      oy[:2]))

     # find line
     tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy))

     # if error
     if success not in range(1, 5):
         raise ValueError("No line for this dots")

     # return new dots
     return func_line(tpl_final, numpy.array(xnew))


 # TODO: revise next
 # def difference(y, ynew):
 #     """returns average and maximum relative and
 #        absolute differences between y and ynew
 #        result may contain None values for y = 0
 #        return value - tuple:
 #        [(abs dif, rel dif) * len(y)],
 #        (abs average, abs max),
 #        (rel average, rel max)"""
 #
 #     abs_dlist = []
 #     rel_dlist = []
 #
 #     for y1, y2 in zip(y, ynew):
 #         # absolute
 #         abs_dlist.append(y1 - y2)
 #
 #         if y1 > 1E-6:
 #             rel_dlist.append(abs(abs_dlist[-1] / y1))
 #         else:
 #             raise ZeroDivisionError("{0!r} is too small".format(y1))
 #
 #     da_avg = sum(abs_dlist) / len(abs_dlist)
 #     dr_avg = sum(rel_dlist) / len(rel_dlist)
 #
 #     return (zip(abs_dlist, rel_dlist),
 #             (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
 #             )
	import math
	import logging
	import itertools
	import statistics
	from typing import List, Callable, Iterable, cast

	import numpy
	from scipy import stats, optimize
	from numpy import linalg
	from numpy.polynomial.chebyshev import chebfit, chebval


	from .result_classes import NormStatProps, HistoStatProps, TimeSeries
	from .utils import Number


	logger = logging.getLogger("wally")
	DOUBLE_DELTA = 1e-8
	MIN_VALUES_FOR_CONFIDENCE = 7


	average = numpy.mean
	dev = lambda x: math.sqrt(numpy.var(x, ddof=1))


	def calc_norm_stat_props(ts: TimeSeries, bins_count: int, confidence: float = 0.95) -> NormStatProps:
	"Calculate statistical properties of array of numbers"

	# array.array has very basic support
	data = cast(List[int], ts.data)
	res = NormStatProps(data) # type: ignore

	if len(data) == 0:
	raise ValueError("Input array is empty")

	data = sorted(data)
	res.average = average(data)
	res.deviation = dev(data)

	res.max = data[-1]
	res.min = data[0]

	res.perc_50, res.perc_90, res.perc_99, res.perc_99 = numpy.percentile(data, q=[50., 90., 95., 99.])

	if len(data) >= MIN_VALUES_FOR_CONFIDENCE:
	res.confidence = stats.sem(data) * \
	stats.t.ppf((1 + confidence) / 2, len(data) - 1)
	res.confidence_level = confidence
	else:
	res.confidence = None
	res.confidence_level = None

	res.bins_populations, bins_edges = numpy.histogram(data, bins=bins_count)
	res.bins_mids = (bins_edges[:-1] + bins_edges[1:]) / 2

	try:
	res.normtest = stats.mstats.normaltest(data)
	except Exception as exc:
	logger.warning("stats.mstats.normaltest failed with error: %s", exc)

	res.skew = stats.skew(data)
	res.kurt = stats.kurtosis(data)

	return res


	def calc_histo_stat_props(ts: TimeSeries,
	bins_edges: numpy.array,
	bins_count: int,
	min_valuable: float = 0.0001) -> HistoStatProps:
	data = numpy.array(ts.data, dtype='int')
	data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size] # type: ignore

	res = HistoStatProps(ts.data, ts.second_axis_size)

	# summ across all series
	aggregated = numpy.sum(data, axis=0, dtype='int')
	total = numpy.sum(aggregated)

	# minimal value used for histo
	min_val_on_histo = total * min_valuable

	# percentiles levels
	expected = [total * 0.5, total * 0.9, total * 0.95, total * 0.99]
	percentiles = []

	# all indexes, where values greater than min_val_on_histo
	valuable_idxs = []

	curr_summ = 0
	non_zero = aggregated.nonzero()[0]

	# calculate percentiles and valuable_indexes
	for idx in non_zero:
	val = aggregated[idx]
	while expected and curr_summ + val >= expected[0]:
	percentiles.append(bins_edges[idx])
	del expected[0]

	curr_summ += val

	if val >= min_val_on_histo:
	valuable_idxs.append(idx)

	res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles

	# minimax and maximal non-zero elements
	res.min = bins_edges[aggregated[non_zero[0]]]
	res.max = bins_edges[non_zero[-1] + (1 if non_zero[-1] != len(bins_edges) else 0)]

	# minimal and maximal valueble evelemts
	val_idx_min = valuable_idxs[0]
	val_idx_max = valuable_idxs[-1]

	raw_bins_populations = aggregated[val_idx_min: val_idx_max + 1]
	raw_bins_edges = bins_edges[val_idx_min: val_idx_max + 2]
	raw_bins_mids = cast(numpy.array, (raw_bins_edges[1:] + raw_bins_edges[:-1]) / 2)

	step = (raw_bins_mids[-1] + raw_bins_mids[0]) / bins_count
	next = raw_bins_mids[0]

	# aggregate raw histogram with many bins into result histogram with bins_count bins
	cidx = 0
	bins_populations = []
	bins_mids = []

	while cidx < len(raw_bins_mids):
	next += step
	bin_population = 0

	while cidx < len(raw_bins_mids) and raw_bins_mids[cidx] <= next:
	bin_population += raw_bins_populations[cidx]
	cidx += 1

	bins_populations.append(bin_population)
	bins_mids.append(next - step / 2)

	res.bins_populations = numpy.array(bins_populations, dtype='int')
	res.bins_mids = numpy.array(bins_mids, dtype='float32')

	return res


	def groupby_globally(data: Iterable, key_func: Callable):
	grouped = {} # type: ignore
	grouped_iter = itertools.groupby(data, key_func)

	for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
	key = (bs, cache_tp, act, conc)
	grouped.setdefault(key, []).extend(curr_data_it)

	return grouped


	def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
	"""returns ynew - y values of some curve approximation"""
	return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))


	def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
	"""
	x, y - test data, xnew - dots, where we want find approximation
	if not relative_dist distance = y - newy
	returns ynew - y values of linear approximation
	"""
	ox = numpy.array(x)
	oy = numpy.array(y)

	# set approximation function
	def func_line(tpl, x):
	return tpl[0] * x + tpl[1]

	def error_func_rel(tpl, x, y):
	return 1.0 - y / func_line(tpl, x)

	def error_func_abs(tpl, x, y):
	return y - func_line(tpl, x)

	# choose distance mode
	error_func = error_func_rel if relative_dist else error_func_abs

	tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
	oy[:2]))

	# find line
	tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy))

	# if error
	if success not in range(1, 5):
	raise ValueError("No line for this dots")

	# return new dots
	return func_line(tpl_final, numpy.array(xnew))


	# TODO: revise next
	# def difference(y, ynew):
	# """returns average and maximum relative and
	# absolute differences between y and ynew
	# result may contain None values for y = 0
	# return value - tuple:
	# [(abs dif, rel dif) * len(y)],
	# (abs average, abs max),
	# (rel average, rel max)"""
	#
	# abs_dlist = []
	# rel_dlist = []
	#
	# for y1, y2 in zip(y, ynew):
	# # absolute
	# abs_dlist.append(y1 - y2)
	#
	# if y1 > 1E-6:
	# rel_dlist.append(abs(abs_dlist[-1] / y1))
	# else:
	# raise ZeroDivisionError("{0!r} is too small".format(y1))
	#
	# da_avg = sum(abs_dlist) / len(abs_dlist)
	# dr_avg = sum(rel_dlist) / len(rel_dlist)
	#
	# return (zip(abs_dlist, rel_dlist),
	# (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
	# )