blob: 259ac69118b38e51a8fc7ae6c71a440b5bb222c4 [file] [log] [blame]
koder aka kdanilov6c491062015-04-09 22:33:13 +03001import math
koder aka kdanilovf2865172016-12-30 03:35:11 +02002import array
koder aka kdanilovffaf48d2016-12-27 02:25:29 +02003import logging
koder aka kdanilov6c491062015-04-09 22:33:13 +03004import itertools
koder aka kdanilov7f59d562016-12-26 01:34:23 +02005import statistics
koder aka kdanilovf2865172016-12-30 03:35:11 +02006from typing import List, Callable, Iterable, cast
koder aka kdanilovcff7b2e2015-04-18 20:48:15 +03007
koder aka kdanilov7f59d562016-12-26 01:34:23 +02008import numpy
9from scipy import stats, optimize
10from numpy import linalg
11from numpy.polynomial.chebyshev import chebfit, chebval
koder aka kdanilov6c491062015-04-09 22:33:13 +030012
13
koder aka kdanilovf2865172016-12-30 03:35:11 +020014from .result_classes import NormStatProps, HistoStatProps, TimeSeries
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020015from .utils import Number
koder aka kdanilovbb6d6cd2015-06-20 02:55:07 +030016
17
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020018logger = logging.getLogger("wally")
koder aka kdanilov7f59d562016-12-26 01:34:23 +020019DOUBLE_DELTA = 1e-8
koder aka kdanilove87ae652015-04-20 02:14:35 +030020
21
koder aka kdanilov7f59d562016-12-26 01:34:23 +020022average = statistics.mean
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020023dev = lambda x: math.sqrt(statistics.variance(x))
koder aka kdanilov6c491062015-04-09 22:33:13 +030024
25
koder aka kdanilovf2865172016-12-30 03:35:11 +020026def calc_norm_stat_props(ts: TimeSeries, confidence: float = 0.95) -> NormStatProps:
koder aka kdanilov7f59d562016-12-26 01:34:23 +020027 "Calculate statistical properties of array of numbers"
28
koder aka kdanilovf2865172016-12-30 03:35:11 +020029 data = ts.data
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020030 res = NormStatProps(data)
koder aka kdanilov7f59d562016-12-26 01:34:23 +020031
32 if len(data) == 0:
33 raise ValueError("Input array is empty")
34
35 data = sorted(data)
36 res.average = average(data)
37 res.deviation = dev(data)
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020038
koder aka kdanilov7f59d562016-12-26 01:34:23 +020039 res.max = data[-1]
40 res.min = data[0]
41
42 res.perc_50 = numpy.percentile(data, 50)
43 res.perc_90 = numpy.percentile(data, 90)
44 res.perc_95 = numpy.percentile(data, 95)
45 res.perc_99 = numpy.percentile(data, 99)
46
47 if len(data) >= 3:
48 res.confidence = stats.sem(data) * \
49 stats.t.ppf((1 + confidence) / 2, len(data) - 1)
50 else:
51 res.confidence = None
52
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020053 res.bin_populations, res.bin_edges = numpy.histogram(data, 'auto')
54
55 try:
56 res.normtest = stats.mstats.normaltest(data)
57 except Exception as exc:
58 logger.warning("stats.mstats.normaltest failed with error: %s", exc)
59
koder aka kdanilov7f59d562016-12-26 01:34:23 +020060 return res
61
62
koder aka kdanilovf2865172016-12-30 03:35:11 +020063def calc_histo_stat_props(ts: TimeSeries) -> HistoStatProps:
64 data = numpy.array(ts.data)
65 data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size]
66
67 res = HistoStatProps(ts.data, ts.second_axis_size)
68 aggregated = numpy.sum(data, axis=0)
69
70 full_sum = numpy.sum(aggregated)
71 expected = [full_sum * 0.5, full_sum * 0.9, full_sum * 0.95, full_sum * 0.99]
72 percentiles = []
73
74 val_min = None
75 val_max = None
76
77 for idx, val in enumerate(aggregated):
78 while expected and full_sum + val >= expected[0]:
79 percentiles.append(idx)
80 del expected[0]
81
82 full_sum += val
83
84 if val != 0:
85 if val_min is None:
86 val_min = idx
87 val_max = idx
88
89 res.perc_50, res.perc_90, res.perc_95, res.perc_99 = map(ts.bins_edges.__getitem__, percentiles)
90 res.min = ts.bins_edges[val_min]
91 res.max = ts.bins_edges[val_max]
92 res.bin_populations = aggregated
93 return res
94
95
koder aka kdanilov7f59d562016-12-26 01:34:23 +020096def groupby_globally(data: Iterable, key_func: Callable):
97 grouped = {} # type: ignore
koder aka kdanilov6c491062015-04-09 22:33:13 +030098 grouped_iter = itertools.groupby(data, key_func)
99
100 for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
101 key = (bs, cache_tp, act, conc)
102 grouped.setdefault(key, []).extend(curr_data_it)
103
104 return grouped
105
106
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200107def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
koder aka kdanilov6c491062015-04-09 22:33:13 +0300108 """returns ynew - y values of some curve approximation"""
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200109 return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))
koder aka kdanilov6c491062015-04-09 22:33:13 +0300110
111
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200112def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
113 """
114 x, y - test data, xnew - dots, where we want find approximation
115 if not relative_dist distance = y - newy
116 returns ynew - y values of linear approximation
117 """
118 ox = numpy.array(x)
119 oy = numpy.array(y)
koder aka kdanilov66839a92015-04-11 13:22:31 +0300120
Ved-vampir03166442015-04-10 17:28:23 +0300121 # set approximation function
koder aka kdanilov66839a92015-04-11 13:22:31 +0300122 def func_line(tpl, x):
123 return tpl[0] * x + tpl[1]
124
125 def error_func_rel(tpl, x, y):
126 return 1.0 - y / func_line(tpl, x)
127
128 def error_func_abs(tpl, x, y):
129 return y - func_line(tpl, x)
130
Ved-vampir03166442015-04-10 17:28:23 +0300131 # choose distance mode
koder aka kdanilov66839a92015-04-11 13:22:31 +0300132 error_func = error_func_rel if relative_dist else error_func_abs
133
134 tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
135 oy[:2]))
136
Ved-vampir03166442015-04-10 17:28:23 +0300137 # find line
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200138 tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy))
koder aka kdanilov66839a92015-04-11 13:22:31 +0300139
Ved-vampir03166442015-04-10 17:28:23 +0300140 # if error
141 if success not in range(1, 5):
142 raise ValueError("No line for this dots")
koder aka kdanilov66839a92015-04-11 13:22:31 +0300143
Ved-vampir03166442015-04-10 17:28:23 +0300144 # return new dots
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200145 return func_line(tpl_final, numpy.array(xnew))
koder aka kdanilov6c491062015-04-09 22:33:13 +0300146
147
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200148# TODO: revise next
149# def difference(y, ynew):
150# """returns average and maximum relative and
151# absolute differences between y and ynew
152# result may contain None values for y = 0
153# return value - tuple:
154# [(abs dif, rel dif) * len(y)],
155# (abs average, abs max),
156# (rel average, rel max)"""
157#
158# abs_dlist = []
159# rel_dlist = []
160#
161# for y1, y2 in zip(y, ynew):
162# # absolute
163# abs_dlist.append(y1 - y2)
164#
165# if y1 > 1E-6:
166# rel_dlist.append(abs(abs_dlist[-1] / y1))
167# else:
168# raise ZeroDivisionError("{0!r} is too small".format(y1))
169#
170# da_avg = sum(abs_dlist) / len(abs_dlist)
171# dr_avg = sum(rel_dlist) / len(rel_dlist)
172#
173# return (zip(abs_dlist, rel_dlist),
174# (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
175# )