import math
import logging
import itertools
import statistics
from typing import List, Callable, Iterable, cast

import numpy
from scipy import stats, optimize
from numpy import linalg
from numpy.polynomial.chebyshev import chebfit, chebval


from .result_classes import NormStatProps, HistoStatProps, TimeSeries
from .utils import Number


logger = logging.getLogger("wally")
DOUBLE_DELTA = 1e-8
MIN_VALUES_FOR_CONFIDENCE = 7


average = numpy.mean
dev = lambda x: math.sqrt(numpy.var(x, ddof=1))


def calc_norm_stat_props(ts: TimeSeries, bins_count: int, confidence: float = 0.95) -> NormStatProps:
    "Calculate statistical properties of an array of numbers"

    # array.array has only very basic typing support, so cast to List[int]
    data = cast(List[int], ts.data)
    res = NormStatProps(data)  # type: ignore

    if len(data) == 0:
        raise ValueError("Input array is empty")

    data = sorted(data)
    res.average = average(data)
    res.deviation = dev(data)

    res.max = data[-1]
    res.min = data[0]

    res.perc_50, res.perc_90, res.perc_95, res.perc_99 = numpy.percentile(data, q=[50., 90., 95., 99.])

    if len(data) >= MIN_VALUES_FOR_CONFIDENCE:
        res.confidence = stats.sem(data) * \
                         stats.t.ppf((1 + confidence) / 2, len(data) - 1)
        res.confidence_level = confidence
    else:
        res.confidence = None
        res.confidence_level = None

    res.bins_populations, bins_edges = numpy.histogram(data, bins=bins_count)
    res.bins_mids = (bins_edges[:-1] + bins_edges[1:]) / 2

    try:
        res.normtest = stats.mstats.normaltest(data)
    except Exception as exc:
        logger.warning("stats.mstats.normaltest failed with error: %s", exc)

    res.skew = stats.skew(data)
    res.kurt = stats.kurtosis(data)

    return res

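
# Usage sketch (illustrative only): calc_norm_stat_props reads just the ``data``
# attribute of the TimeSeries it receives, so a minimal hypothetical stand-in
# object is enough to demonstrate the call; the stub is not part of result_classes.
def _calc_norm_stat_props_example() -> None:
    class _StubTS:
        def __init__(self, data: List[int]) -> None:
            self.data = data

    props = calc_norm_stat_props(_StubTS(list(range(1, 101))), bins_count=10)  # type: ignore
    print(props.average, props.deviation, props.perc_50, props.perc_99)
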

def calc_histo_stat_props(ts: TimeSeries,
                          bins_edges: numpy.array,
                          bins_count: int,
                          min_valuable: float = 0.0001) -> HistoStatProps:
    data = numpy.array(ts.data, dtype='int')
    data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size]  # type: ignore

    res = HistoStatProps(ts.data, ts.second_axis_size)

    # sum across all time series
    aggregated = numpy.sum(data, axis=0, dtype='int')
    total = numpy.sum(aggregated)

    # minimal bin population to be counted as valuable (used to trim the histogram range)
    min_val_on_histo = total * min_valuable

    # percentile levels
    expected = [total * 0.5, total * 0.9, total * 0.95, total * 0.99]
    percentiles = []

    # indexes of all bins with population of at least min_val_on_histo
    valuable_idxs = []

    curr_summ = 0
    non_zero = aggregated.nonzero()[0]

    # calculate percentiles and valuable_idxs
    for idx in non_zero:
        val = aggregated[idx]
        while expected and curr_summ + val >= expected[0]:
            percentiles.append(bins_edges[idx])
            del expected[0]

        curr_summ += val

        if val >= min_val_on_histo:
            valuable_idxs.append(idx)

    res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles

    # minimal and maximal non-zero elements
    res.min = bins_edges[non_zero[0]]
    res.max = bins_edges[non_zero[-1] + (1 if non_zero[-1] != len(bins_edges) else 0)]

    # minimal and maximal valuable elements
    val_idx_min = valuable_idxs[0]
    val_idx_max = valuable_idxs[-1]

    raw_bins_populations = aggregated[val_idx_min: val_idx_max + 1]
    raw_bins_edges = bins_edges[val_idx_min: val_idx_max + 2]
    raw_bins_mids = cast(numpy.array, (raw_bins_edges[1:] + raw_bins_edges[:-1]) / 2)

    # width of one aggregated bin
    step = (raw_bins_mids[-1] - raw_bins_mids[0]) / bins_count
    next = raw_bins_mids[0]

    # aggregate raw histogram with many bins into result histogram with bins_count bins
    cidx = 0
    bins_populations = []
    bins_mids = []

    while cidx < len(raw_bins_mids):
        next += step
        bin_population = 0

        while cidx < len(raw_bins_mids) and raw_bins_mids[cidx] <= next:
            bin_population += raw_bins_populations[cidx]
            cidx += 1

        bins_populations.append(bin_population)
        bins_mids.append(next - step / 2)

    res.bins_populations = numpy.array(bins_populations, dtype='int')
    res.bins_mids = numpy.array(bins_mids, dtype='float32')

    return res

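
# Usage sketch (illustrative only): calc_histo_stat_props is assumed here to take
# ts.data as a flat sequence of per-sample histograms, each ``second_axis_size``
# buckets long, with bins_edges holding one more edge than there are buckets.
# The stand-in class below is hypothetical and mimics only the two attributes
# the function touches.
def _calc_histo_stat_props_example() -> None:
    class _StubHistoTS:
        def __init__(self, data: List[int], second_axis_size: int) -> None:
            self.data = data
            self.second_axis_size = second_axis_size

    buckets = 100
    edges = numpy.linspace(0.0, 1.0, buckets + 1)
    counts = [int(v) for v in numpy.random.randint(0, 50, size=buckets * 10)]
    props = calc_histo_stat_props(_StubHistoTS(counts, buckets), edges, bins_count=20)  # type: ignore
    print(props.perc_50, props.perc_99, props.min, props.max)
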

def groupby_globally(data: Iterable, key_func: Callable):
    grouped = {}  # type: ignore
    grouped_iter = itertools.groupby(data, key_func)

    for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
        key = (bs, cache_tp, act, conc)
        grouped.setdefault(key, []).extend(curr_data_it)

    return grouped

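
# Usage sketch (field names are illustrative): unlike plain itertools.groupby,
# groupby_globally merges records with equal keys even when they are not adjacent
# in the input. Keys are expected to be (bs, cache_tp, act, conc) tuples.
def _groupby_globally_example() -> None:
    runs = [
        {"bs": "4k", "cache_tp": "wb", "act": "randwrite", "conc": 1, "iops": 100},
        {"bs": "64k", "cache_tp": "wb", "act": "write", "conc": 4, "iops": 80},
        {"bs": "4k", "cache_tp": "wb", "act": "randwrite", "conc": 1, "iops": 110},
    ]
    grouped = groupby_globally(runs, lambda run: (run["bs"], run["cache_tp"], run["act"], run["conc"]))
    print(grouped[("4k", "wb", "randwrite", 1)])  # both "4k" records end up together
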

def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
    """returns ynew - values of a Chebyshev polynomial of degree curved_coef, fitted to (x, y) and evaluated at xnew"""
    return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))

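
# Usage sketch: fit a degree-2 Chebyshev polynomial to a handful of points and
# evaluate it on new x values (the numbers below are made up for illustration).
def _approximate_curve_example() -> None:
    x = [1, 2, 3, 4, 5]
    y = [1.2, 3.8, 9.1, 16.3, 24.9]
    xnew = [1.5, 2.5, 3.5, 4.5]
    print(approximate_curve(x, y, xnew, curved_coef=2))
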

def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
    """
    x, y - test data; xnew - points where the approximation should be evaluated
    if relative_dist is False the residual is y - ynew, otherwise it is 1 - y / ynew
    returns ynew - y values of the linear approximation
    """
    ox = numpy.array(x)
    oy = numpy.array(y)

    # the model: a straight line
    def func_line(tpl, x):
        return tpl[0] * x + tpl[1]

    def error_func_rel(tpl, x, y):
        return 1.0 - y / func_line(tpl, x)

    def error_func_abs(tpl, x, y):
        return y - func_line(tpl, x)

    # choose the residual (distance) mode
    error_func = error_func_rel if relative_dist else error_func_abs

    # initial guess: the line through the first two points
    tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
                                     oy[:2]))

    # fit the line
    tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy))

    # leastsq reports success with a code in 1..4
    if success not in range(1, 5):
        raise ValueError("Can't fit a line to the provided points")

    # evaluate the fitted line at the new points
    return func_line(tpl_final, numpy.array(xnew))

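
# Usage sketch: fit a line to roughly linear data and evaluate it at new points;
# with relative_dist=True the fit minimizes relative instead of absolute residuals.
def _approximate_line_example() -> None:
    x = [1, 2, 3, 4, 5]
    y = [2.1, 3.9, 6.2, 7.8, 10.1]
    xnew = [1.5, 2.5, 6.0]
    print(approximate_line(x, y, xnew))
    print(approximate_line(x, y, xnew, relative_dist=True))
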

# TODO: revise next
# def difference(y, ynew):
#     """returns average and maximum relative and
#     absolute differences between y and ynew
#     result may contain None values for y = 0
#     return value - tuple:
#     [(abs dif, rel dif) * len(y)],
#     (abs average, abs max),
#     (rel average, rel max)"""
#
#     abs_dlist = []
#     rel_dlist = []
#
#     for y1, y2 in zip(y, ynew):
#         # absolute
#         abs_dlist.append(y1 - y2)
#
#         if y1 > 1E-6:
#             rel_dlist.append(abs(abs_dlist[-1] / y1))
#         else:
#             raise ZeroDivisionError("{0!r} is too small".format(y1))
#
#     da_avg = sum(abs_dlist) / len(abs_dlist)
#     dr_avg = sum(rel_dlist) / len(rel_dlist)
#
#     return (zip(abs_dlist, rel_dlist),
#             (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
#             )