blob: 2f68ca5ae48742945c263c8aee76be6c5c41e440 [file] [log] [blame]
koder aka kdanilov6c491062015-04-09 22:33:13 +03001import math
koder aka kdanilovffaf48d2016-12-27 02:25:29 +02002import logging
koder aka kdanilov6c491062015-04-09 22:33:13 +03003import itertools
koder aka kdanilov7f59d562016-12-26 01:34:23 +02004import statistics
5from typing import Union, List, TypeVar, Callable, Iterable, Tuple, Any, cast, Dict
koder aka kdanilovcff7b2e2015-04-18 20:48:15 +03006
koder aka kdanilov7f59d562016-12-26 01:34:23 +02007import numpy
8from scipy import stats, optimize
9from numpy import linalg
10from numpy.polynomial.chebyshev import chebfit, chebval
koder aka kdanilov6c491062015-04-09 22:33:13 +030011
12
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020013from .result_classes import NormStatProps
14from .utils import Number
koder aka kdanilovbb6d6cd2015-06-20 02:55:07 +030015
16
# Logger registered under the "wally" name.
logger = logging.getLogger("wally")

# NOTE(review): presumably a tolerance for float comparisons; not used in this section.
DOUBLE_DELTA = 1e-8


# Arithmetic mean, delegated to the standard library.
average = statistics.mean


def dev(x):
    """Return the sample standard deviation of ``x``: sqrt(statistics.variance(x))."""
    sample_variance = statistics.variance(x)
    return math.sqrt(sample_variance)
koder aka kdanilov6c491062015-04-09 22:33:13 +030023
24
def calc_norm_stat_props(data: List[Number], confidence: float = 0.95) -> NormStatProps:
    """Calculate statistical properties of an array of numbers.

    Fills a NormStatProps with the average, sample deviation, min/max,
    the 50/90/95/99 percentiles, a confidence value, a histogram and the
    result of a normality test.

    data - samples to describe, must not be empty
    confidence - confidence level used for the Student's t based confidence value

    returns the populated NormStatProps
    raises ValueError if data is empty
    """

    res = NormStatProps(data)

    if len(data) == 0:
        raise ValueError("Input array is empty")

    data = sorted(data)
    res.average = average(data)

    # statistics.variance() (used by dev) raises StatisticsError for fewer than
    # two samples, so a lone sample is reported with zero deviation instead of
    # crashing the whole calculation.
    res.deviation = dev(data) if len(data) > 1 else 0.0

    # data is sorted, so the extremes sit at both ends
    res.max = data[-1]
    res.min = data[0]

    res.perc_50 = numpy.percentile(data, 50)
    res.perc_90 = numpy.percentile(data, 90)
    res.perc_95 = numpy.percentile(data, 95)
    res.perc_99 = numpy.percentile(data, 99)

    if len(data) >= 3:
        # standard error of the mean scaled by the two-sided Student's t quantile
        t_quantile = stats.t.ppf((1 + confidence) / 2, len(data) - 1)
        res.confidence = stats.sem(data) * t_quantile
    else:
        res.confidence = None

    res.bin_populations, res.bin_edges = numpy.histogram(data, 'auto')

    # The normality test is best effort: a failure is logged and res.normtest
    # keeps whatever value NormStatProps gave it.
    try:
        res.normtest = stats.mstats.normaltest(data)
    except Exception as exc:
        logger.warning("stats.mstats.normaltest failed with error: %s", exc)

    return res
59
60
def groupby_globally(data: Iterable, key_func: Callable) -> Dict[Any, List[Any]]:
    """Group every item of data by key_func(item) across the whole iterable.

    itertools.groupby only merges adjacent items with equal keys; here all
    runs with the same key are collected into one list, so data does not
    need to be sorted by key first.

    data - items to group
    key_func - returns the grouping key of an item; the key must be hashable
               (a list key is converted to a tuple)

    returns a dict mapping key -> list of items, in order of appearance
    """
    grouped = {}  # type: Dict[Any, List[Any]]

    for key, run in itertools.groupby(data, key_func):
        # Lists are not hashable; turn them into tuples so they can be used as dict keys.
        if isinstance(key, list):
            key = tuple(key)
        grouped.setdefault(key, []).extend(run)

    return grouped
70
71
def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
    """Fit a Chebyshev series to (x, y) and return its values at xnew.

    curved_coef is passed to chebfit as the degree of the fitted series.
    """
    coefficients = chebfit(x, y, curved_coef)
    ynew = chebval(xnew, coefficients)
    return cast(List[float], ynew)
koder aka kdanilov6c491062015-04-09 22:33:13 +030075
76
def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
    """
    Fit a straight line to the points (x, y) by least squares and evaluate it at xnew.

    x, y - test data, xnew - dots, where we want to find the approximation
    relative_dist - if False the minimised distance is y - ynew,
                    otherwise it is the relative distance 1 - y / ynew
    returns ynew - y values of the linear approximation at xnew (a numpy array)
    raises ValueError if the data has no two points with different x,
           or if the least squares search fails
    """
    ox = numpy.array(x)
    oy = numpy.array(y)

    # set approximation function
    def func_line(tpl, x):
        return tpl[0] * x + tpl[1]

    def error_func_rel(tpl, x, y):
        return 1.0 - y / func_line(tpl, x)

    def error_func_abs(tpl, x, y):
        return y - func_line(tpl, x)

    # choose distance mode
    error_func = error_func_rel if relative_dist else error_func_abs

    # The initial guess is the exact line through the first point and the first
    # later point with a different x. Always pairing the first two points makes
    # the 2x2 system singular whenever they share an x value.
    pivot = next((i for i in range(1, len(ox)) if ox[i] != ox[0]), None)
    if pivot is None:
        raise ValueError("At least two points with different x are required to fit a line")

    tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[pivot], 1.0]],
                                     [oy[0], oy[pivot]]))

    # find line
    tpl_final, ier = optimize.leastsq(error_func, tpl_initial, args=(ox, oy))

    # leastsq reports a found solution with ier in 1..4, anything else is an error
    if ier not in range(1, 5):
        raise ValueError("No line for this dots")

    # return new dots
    return func_line(tpl_final, numpy.array(xnew))
koder aka kdanilov6c491062015-04-09 22:33:13 +0300111
112
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200113# TODO: revise next
114# def difference(y, ynew):
115# """returns average and maximum relative and
116# absolute differences between y and ynew
117# result may contain None values for y = 0
118# return value - tuple:
119# [(abs dif, rel dif) * len(y)],
120# (abs average, abs max),
121# (rel average, rel max)"""
122#
123# abs_dlist = []
124# rel_dlist = []
125#
126# for y1, y2 in zip(y, ynew):
127# # absolute
128# abs_dlist.append(y1 - y2)
129#
130# if y1 > 1E-6:
131# rel_dlist.append(abs(abs_dlist[-1] / y1))
132# else:
133# raise ZeroDivisionError("{0!r} is too small".format(y1))
134#
135# da_avg = sum(abs_dlist) / len(abs_dlist)
136# dr_avg = sum(rel_dlist) / len(rel_dlist)
137#
138# return (zip(abs_dlist, rel_dlist),
139# (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
140# )