blob: 047f86d4817bda0a040c86da494733a147fccfc8 [file] [log] [blame]
koder aka kdanilov6c491062015-04-09 22:33:13 +03001import math
koder aka kdanilovffaf48d2016-12-27 02:25:29 +02002import logging
koder aka kdanilov6c491062015-04-09 22:33:13 +03003import itertools
koder aka kdanilova732a602017-02-01 20:29:56 +02004from typing import List, Callable, Iterable, cast, Tuple
koder aka kdanilovcff7b2e2015-04-18 20:48:15 +03005
koder aka kdanilov7f59d562016-12-26 01:34:23 +02006import numpy
7from scipy import stats, optimize
8from numpy import linalg
9from numpy.polynomial.chebyshev import chebfit, chebval
koder aka kdanilov6c491062015-04-09 22:33:13 +030010
11
koder aka kdanilovf2865172016-12-30 03:35:11 +020012from .result_classes import NormStatProps, HistoStatProps, TimeSeries
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020013from .utils import Number
koder aka kdanilovbb6d6cd2015-06-20 02:55:07 +030014
15
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020016logger = logging.getLogger("wally")
koder aka kdanilov7f59d562016-12-26 01:34:23 +020017DOUBLE_DELTA = 1e-8
koder aka kdanilov108ac362017-01-19 20:17:16 +020018MIN_VALUES_FOR_CONFIDENCE = 7
koder aka kdanilove87ae652015-04-20 02:14:35 +030019
20
koder aka kdanilov108ac362017-01-19 20:17:16 +020021average = numpy.mean
22dev = lambda x: math.sqrt(numpy.var(x, ddof=1))
koder aka kdanilov6c491062015-04-09 22:33:13 +030023
24
kdanylov aka koder150b2192017-04-01 16:53:01 +030025def calc_norm_stat_props(ts: TimeSeries, bins_count: int = None, confidence: float = 0.95) -> NormStatProps:
koder aka kdanilov7f59d562016-12-26 01:34:23 +020026 "Calculate statistical properties of array of numbers"
27
koder aka kdanilov108ac362017-01-19 20:17:16 +020028 # array.array has very basic support
29 data = cast(List[int], ts.data)
kdanylov aka koder45183182017-04-30 23:55:40 +030030 res = NormStatProps(data, ts.units) # type: ignore
koder aka kdanilov7f59d562016-12-26 01:34:23 +020031
32 if len(data) == 0:
33 raise ValueError("Input array is empty")
34
35 data = sorted(data)
36 res.average = average(data)
37 res.deviation = dev(data)
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020038
koder aka kdanilov7f59d562016-12-26 01:34:23 +020039 res.max = data[-1]
40 res.min = data[0]
41
koder aka kdanilova732a602017-02-01 20:29:56 +020042 pcs = numpy.percentile(data, q=[1.0, 5.0, 10., 50., 90., 95., 99.])
43 res.perc_1, res.perc_5, res.perc_10, res.perc_50, res.perc_90, res.perc_95, res.perc_99 = pcs
koder aka kdanilov7f59d562016-12-26 01:34:23 +020044
koder aka kdanilov108ac362017-01-19 20:17:16 +020045 if len(data) >= MIN_VALUES_FOR_CONFIDENCE:
koder aka kdanilov7f59d562016-12-26 01:34:23 +020046 res.confidence = stats.sem(data) * \
47 stats.t.ppf((1 + confidence) / 2, len(data) - 1)
koder aka kdanilov108ac362017-01-19 20:17:16 +020048 res.confidence_level = confidence
koder aka kdanilov7f59d562016-12-26 01:34:23 +020049 else:
50 res.confidence = None
koder aka kdanilov108ac362017-01-19 20:17:16 +020051 res.confidence_level = None
koder aka kdanilov7f59d562016-12-26 01:34:23 +020052
kdanylov aka koder150b2192017-04-01 16:53:01 +030053 if bins_count is not None:
54 res.bins_populations, res.bins_edges = numpy.histogram(data, bins=bins_count)
55 res.bins_edges = res.bins_edges[:-1]
koder aka kdanilovffaf48d2016-12-27 02:25:29 +020056
57 try:
58 res.normtest = stats.mstats.normaltest(data)
59 except Exception as exc:
60 logger.warning("stats.mstats.normaltest failed with error: %s", exc)
61
koder aka kdanilov108ac362017-01-19 20:17:16 +020062 res.skew = stats.skew(data)
63 res.kurt = stats.kurtosis(data)
64
koder aka kdanilov7f59d562016-12-26 01:34:23 +020065 return res
66
67
koder aka kdanilova732a602017-02-01 20:29:56 +020068# update this code
69def rebin_histogram(bins_populations: numpy.array,
70 bins_edges: numpy.array,
71 new_bins_count: int,
72 left_tail_idx: int = None,
73 right_tail_idx: int = None,
74 log_bins: bool = False) -> Tuple[numpy.array, numpy.array]:
75 # rebin large histogram into smaller with new_bins bins, linearly distributes across
76 # left_tail_idx:right_tail_idx range
77
78 assert len(bins_populations.shape) == 1
79 assert len(bins_edges.shape) == 1
80 assert bins_edges.shape[0] == bins_populations.shape[0]
81
82 if left_tail_idx is None:
83 min_val = bins_edges[0]
84 else:
85 min_val = bins_edges[left_tail_idx]
86
87 if right_tail_idx is None:
88 max_val = bins_edges[-1]
89 else:
90 max_val = bins_edges[right_tail_idx]
91
92 if log_bins:
93 assert min_val > 1E-3
94 step = (max_val / min_val) ** (1 / new_bins_count)
95 new_bins_edges = min_val * (step ** numpy.arange(new_bins_count)) # type: numpy.array
96 else:
97 new_bins_edges = numpy.linspace(min_val, max_val, new_bins_count + 1, dtype='float')[:-1] # type: numpy.array
98
99 old_bins_pos = numpy.searchsorted(new_bins_edges, bins_edges, side='right')
100 new_bins = numpy.zeros(new_bins_count, dtype=int) # type: numpy.array
101
102 # last source bin can't be split
103 # TODO: need to add assert for this
104 new_bins[-1] += bins_populations[-1]
105 bin_sizes = bins_edges[1:] - bins_edges[:-1]
106
107 # correct position to get bin idx from edge idx
108 old_bins_pos -= 1
109 old_bins_pos[old_bins_pos < 0] = 0
110 new_bins_sizes = new_bins_edges[1:] - new_bins_edges[:-1]
111
112 for population, begin, end, bsize in zip(bins_populations[:-1], old_bins_pos[:-1], old_bins_pos[1:], bin_sizes):
113 if begin == end:
114 new_bins[begin] += population
115 else:
116 density = population / bsize
117 for curr_box in range(begin, end):
118 cnt = min(int(new_bins_sizes[begin] * density + 0.5), population)
119 new_bins[begin] += cnt
120 population -= cnt
121
122 return new_bins, new_bins_edges
123
124
koder aka kdanilov108ac362017-01-19 20:17:16 +0200125def calc_histo_stat_props(ts: TimeSeries,
kdanylov aka kodercdfcdaf2017-04-29 10:03:39 +0300126 bins_edges: numpy.array = None,
kdanylov aka koder150b2192017-04-01 16:53:01 +0300127 rebins_count: int = None,
koder aka kdanilova732a602017-02-01 20:29:56 +0200128 tail: float = 0.005) -> HistoStatProps:
kdanylov aka kodercdfcdaf2017-04-29 10:03:39 +0300129 if bins_edges is None:
130 bins_edges = ts.histo_bins
131
kdanylov aka koder45183182017-04-30 23:55:40 +0300132 res = HistoStatProps(ts.data, ts.units)
koder aka kdanilovf2865172016-12-30 03:35:11 +0200133
koder aka kdanilov108ac362017-01-19 20:17:16 +0200134 # summ across all series
koder aka kdanilova732a602017-02-01 20:29:56 +0200135 aggregated = ts.data.sum(axis=0, dtype='int')
136 total = aggregated.sum()
koder aka kdanilov108ac362017-01-19 20:17:16 +0200137
138 # percentiles levels
koder aka kdanilova732a602017-02-01 20:29:56 +0200139 expected = list(numpy.array([0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99]) * total)
140 cumsum = numpy.cumsum(aggregated)
koder aka kdanilovf2865172016-12-30 03:35:11 +0200141
koder aka kdanilova732a602017-02-01 20:29:56 +0200142 percentiles_bins = numpy.searchsorted(cumsum, expected)
143 percentiles = bins_edges[percentiles_bins]
144 res.perc_1, res.perc_5, res.perc_10, res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles
koder aka kdanilovf2865172016-12-30 03:35:11 +0200145
koder aka kdanilova732a602017-02-01 20:29:56 +0200146 # don't show tail ranges on histogram
147 left_tail_idx, right_tail_idx = numpy.searchsorted(cumsum, [tail * total, (1 - tail) * total])
koder aka kdanilov108ac362017-01-19 20:17:16 +0200148
149 # minimax and maximal non-zero elements
koder aka kdanilova732a602017-02-01 20:29:56 +0200150 non_zero = numpy.nonzero(aggregated)[0]
koder aka kdanilov108ac362017-01-19 20:17:16 +0200151 res.min = bins_edges[aggregated[non_zero[0]]]
152 res.max = bins_edges[non_zero[-1] + (1 if non_zero[-1] != len(bins_edges) else 0)]
153
koder aka kdanilova732a602017-02-01 20:29:56 +0200154 res.log_bins = False
kdanylov aka koder150b2192017-04-01 16:53:01 +0300155 if rebins_count is not None:
156 res.bins_populations, res.bins_edges = rebin_histogram(aggregated, bins_edges, rebins_count,
157 left_tail_idx, right_tail_idx)
158 else:
159 res.bins_populations = aggregated
160 res.bins_edges = bins_edges.copy()
koder aka kdanilov108ac362017-01-19 20:17:16 +0200161
koder aka kdanilovf2865172016-12-30 03:35:11 +0200162 return res
163
164
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200165def groupby_globally(data: Iterable, key_func: Callable):
166 grouped = {} # type: ignore
koder aka kdanilov6c491062015-04-09 22:33:13 +0300167 grouped_iter = itertools.groupby(data, key_func)
168
169 for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
170 key = (bs, cache_tp, act, conc)
171 grouped.setdefault(key, []).extend(curr_data_it)
172
173 return grouped
174
175
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200176def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
koder aka kdanilov6c491062015-04-09 22:33:13 +0300177 """returns ynew - y values of some curve approximation"""
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200178 return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))
koder aka kdanilov6c491062015-04-09 22:33:13 +0300179
180
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200181def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
182 """
183 x, y - test data, xnew - dots, where we want find approximation
184 if not relative_dist distance = y - newy
185 returns ynew - y values of linear approximation
186 """
187 ox = numpy.array(x)
188 oy = numpy.array(y)
koder aka kdanilov66839a92015-04-11 13:22:31 +0300189
Ved-vampir03166442015-04-10 17:28:23 +0300190 # set approximation function
koder aka kdanilov66839a92015-04-11 13:22:31 +0300191 def func_line(tpl, x):
192 return tpl[0] * x + tpl[1]
193
194 def error_func_rel(tpl, x, y):
195 return 1.0 - y / func_line(tpl, x)
196
197 def error_func_abs(tpl, x, y):
198 return y - func_line(tpl, x)
199
Ved-vampir03166442015-04-10 17:28:23 +0300200 # choose distance mode
koder aka kdanilov66839a92015-04-11 13:22:31 +0300201 error_func = error_func_rel if relative_dist else error_func_abs
202
203 tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
204 oy[:2]))
205
Ved-vampir03166442015-04-10 17:28:23 +0300206 # find line
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200207 tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy))
koder aka kdanilov66839a92015-04-11 13:22:31 +0300208
Ved-vampir03166442015-04-10 17:28:23 +0300209 # if error
210 if success not in range(1, 5):
211 raise ValueError("No line for this dots")
koder aka kdanilov66839a92015-04-11 13:22:31 +0300212
Ved-vampir03166442015-04-10 17:28:23 +0300213 # return new dots
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200214 return func_line(tpl_final, numpy.array(xnew))
koder aka kdanilov6c491062015-04-09 22:33:13 +0300215
216
koder aka kdanilova732a602017-02-01 20:29:56 +0200217def moving_average(data: numpy.array, window: int) -> numpy.array:
218 cumsum = numpy.cumsum(data)
219 cumsum[window:] = cumsum[window:] - cumsum[:-window]
220 return cumsum[window - 1:] / window
221
222
223def moving_dev(data: numpy.array, window: int) -> numpy.array:
224 cumsum = numpy.cumsum(data)
225 cumsum2 = numpy.cumsum(data ** 2)
226 cumsum[window:] = cumsum[window:] - cumsum[:-window]
227 cumsum2[window:] = cumsum2[window:] - cumsum2[:-window]
228 return ((cumsum2[window - 1:] - cumsum[window - 1:] ** 2 / window) / (window - 1)) ** 0.5
229
230
231def find_ouliers(data: numpy.array,
232 center_range: Tuple[int, int] = (25, 75),
kdanylov aka koder0e0cfcb2017-03-27 22:19:09 +0300233 cut_range: float = 3.0) -> numpy.array:
koder aka kdanilova732a602017-02-01 20:29:56 +0200234 v1, v2 = numpy.percentile(data, center_range)
235 return numpy.abs(data - (v1 + v2) / 2) > ((v2 - v1) / 2 * cut_range)
236
237
238def find_ouliers_ts(data: numpy.array,
239 windows_size: int = 30,
240 center_range: Tuple[int, int] = (25, 75),
kdanylov aka koder0e0cfcb2017-03-27 22:19:09 +0300241 cut_range: float = 3.0) -> numpy.array:
koder aka kdanilova732a602017-02-01 20:29:56 +0200242 outliers = numpy.empty(data.shape, dtype=bool)
243
244 if len(data) < windows_size:
245 outliers[:] = False
246 return outliers
247
248 begin_idx = 0
249 if len(data) < windows_size * 2:
250 end_idx = (len(data) % windows_size) // 2 + windows_size
251 else:
252 end_idx = len(data)
253
254 while True:
255 cdata = data[begin_idx: end_idx]
256 outliers[begin_idx: end_idx] = find_ouliers(cdata, center_range, cut_range)
257 begin_idx = end_idx
258
259 if end_idx == len(data):
260 break
261
262 end_idx += windows_size
263 if len(data) - end_idx < windows_size:
264 end_idx = len(data)
265
266 return outliers
267
268
269def hist_outliers_nd(bin_populations: numpy.array,
270 bin_centers: numpy.array,
271 center_range: Tuple[int, int] = (25, 75),
272 cut_range: float = 3.0) -> Tuple[int, int]:
273 assert len(bin_populations) == len(bin_centers)
274 total_count = bin_populations.sum()
275
276 perc25 = total_count / 100.0 * center_range[0]
277 perc75 = total_count / 100.0 * center_range[1]
278
279 perc25_idx, perc75_idx = numpy.searchsorted(numpy.cumsum(bin_populations), [perc25, perc75])
280 middle = (bin_centers[perc75_idx] + bin_centers[perc25_idx]) / 2
281 r = (bin_centers[perc75_idx] - bin_centers[perc25_idx]) / 2
282
283 lower_bound = middle - r * cut_range
284 upper_bound = middle + r * cut_range
285
286 lower_cut_idx, upper_cut_idx = numpy.searchsorted(bin_centers, [lower_bound, upper_bound])
287 return lower_cut_idx, upper_cut_idx
288
289
290def hist_outliers_perc(bin_populations: numpy.array,
kdanylov aka kodercdfcdaf2017-04-29 10:03:39 +0300291 bounds_perc: Tuple[float, float] = (0.01, 0.99),
292 min_bins_left: int = None) -> Tuple[int, int]:
koder aka kdanilova732a602017-02-01 20:29:56 +0200293 assert len(bin_populations.shape) == 1
294 total_count = bin_populations.sum()
295 lower_perc = total_count * bounds_perc[0]
296 upper_perc = total_count * bounds_perc[1]
kdanylov aka kodercdfcdaf2017-04-29 10:03:39 +0300297 idx1, idx2 = numpy.searchsorted(numpy.cumsum(bin_populations), [lower_perc, upper_perc])
298
299 # don't cut too many bins. At least min_bins_left must left
300 if min_bins_left is not None and idx2 - idx1 < min_bins_left:
301 missed = min_bins_left - (idx2 - idx1) // 2
302 idx2 = min(len(bin_populations), idx2 + missed)
303 idx1 = max(0, idx1 - missed)
304
305 return idx1, idx2
koder aka kdanilova732a602017-02-01 20:29:56 +0200306
307
308def ts_hist_outliers_perc(bin_populations: numpy.array,
309 window_size: int = 10,
kdanylov aka kodercdfcdaf2017-04-29 10:03:39 +0300310 bounds_perc: Tuple[float, float] = (0.01, 0.99),
311 min_bins_left: int = None) -> Tuple[int, int]:
koder aka kdanilova732a602017-02-01 20:29:56 +0200312 assert len(bin_populations.shape) == 2
313
314 points = list(range(0, len(bin_populations), window_size))
315 if len(bin_populations) % window_size != 0:
316 points.append(points[-1] + window_size)
317
kdanylov aka kodercdfcdaf2017-04-29 10:03:39 +0300318 ranges = [] # type: List[List[int]]
koder aka kdanilova732a602017-02-01 20:29:56 +0200319 for begin, end in zip(points[:-1], points[1:]):
320 window_hist = bin_populations[begin:end].sum(axis=0)
kdanylov aka kodercdfcdaf2017-04-29 10:03:39 +0300321 ranges.append(hist_outliers_perc(window_hist, bounds_perc=bounds_perc, min_bins_left=min_bins_left))
koder aka kdanilova732a602017-02-01 20:29:56 +0200322
323 return min(i[0] for i in ranges), max(i[1] for i in ranges)
324
325
koder aka kdanilov7f59d562016-12-26 01:34:23 +0200326# TODO: revise next
327# def difference(y, ynew):
328# """returns average and maximum relative and
329# absolute differences between y and ynew
330# result may contain None values for y = 0
331# return value - tuple:
332# [(abs dif, rel dif) * len(y)],
333# (abs average, abs max),
334# (rel average, rel max)"""
335#
336# abs_dlist = []
337# rel_dlist = []
338#
339# for y1, y2 in zip(y, ynew):
340# # absolute
341# abs_dlist.append(y1 - y2)
342#
343# if y1 > 1E-6:
344# rel_dlist.append(abs(abs_dlist[-1] / y1))
345# else:
346# raise ZeroDivisionError("{0!r} is too small".format(y1))
347#
348# da_avg = sum(abs_dlist) / len(abs_dlist)
349# dr_avg = sum(rel_dlist) / len(rel_dlist)
350#
351# return (zip(abs_dlist, rel_dlist),
352# (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
353# )