koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 1 | import math |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame^] | 2 | import array |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 3 | import logging |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 4 | import itertools |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 5 | import statistics |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame^] | 6 | from typing import List, Callable, Iterable, cast |
koder aka kdanilov | cff7b2e | 2015-04-18 20:48:15 +0300 | [diff] [blame] | 7 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 8 | import numpy |
| 9 | from scipy import stats, optimize |
| 10 | from numpy import linalg |
| 11 | from numpy.polynomial.chebyshev import chebfit, chebval |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 12 | |
| 13 | |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame^] | 14 | from .result_classes import NormStatProps, HistoStatProps, TimeSeries |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 15 | from .utils import Number |
koder aka kdanilov | bb6d6cd | 2015-06-20 02:55:07 +0300 | [diff] [blame] | 16 | |
| 17 | |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 18 | logger = logging.getLogger("wally") |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 19 | DOUBLE_DELTA = 1e-8 |
koder aka kdanilov | e87ae65 | 2015-04-20 02:14:35 +0300 | [diff] [blame] | 20 | |
| 21 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 22 | average = statistics.mean |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 23 | dev = lambda x: math.sqrt(statistics.variance(x)) |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 24 | |
| 25 | |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame^] | 26 | def calc_norm_stat_props(ts: TimeSeries, confidence: float = 0.95) -> NormStatProps: |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 27 | "Calculate statistical properties of array of numbers" |
| 28 | |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame^] | 29 | data = ts.data |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 30 | res = NormStatProps(data) |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 31 | |
| 32 | if len(data) == 0: |
| 33 | raise ValueError("Input array is empty") |
| 34 | |
| 35 | data = sorted(data) |
| 36 | res.average = average(data) |
| 37 | res.deviation = dev(data) |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 38 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 39 | res.max = data[-1] |
| 40 | res.min = data[0] |
| 41 | |
| 42 | res.perc_50 = numpy.percentile(data, 50) |
| 43 | res.perc_90 = numpy.percentile(data, 90) |
| 44 | res.perc_95 = numpy.percentile(data, 95) |
| 45 | res.perc_99 = numpy.percentile(data, 99) |
| 46 | |
| 47 | if len(data) >= 3: |
| 48 | res.confidence = stats.sem(data) * \ |
| 49 | stats.t.ppf((1 + confidence) / 2, len(data) - 1) |
| 50 | else: |
| 51 | res.confidence = None |
| 52 | |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 53 | res.bin_populations, res.bin_edges = numpy.histogram(data, 'auto') |
| 54 | |
| 55 | try: |
| 56 | res.normtest = stats.mstats.normaltest(data) |
| 57 | except Exception as exc: |
| 58 | logger.warning("stats.mstats.normaltest failed with error: %s", exc) |
| 59 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 60 | return res |
| 61 | |
| 62 | |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame^] | 63 | def calc_histo_stat_props(ts: TimeSeries) -> HistoStatProps: |
| 64 | data = numpy.array(ts.data) |
| 65 | data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size] |
| 66 | |
| 67 | res = HistoStatProps(ts.data, ts.second_axis_size) |
| 68 | aggregated = numpy.sum(data, axis=0) |
| 69 | |
| 70 | full_sum = numpy.sum(aggregated) |
| 71 | expected = [full_sum * 0.5, full_sum * 0.9, full_sum * 0.95, full_sum * 0.99] |
| 72 | percentiles = [] |
| 73 | |
| 74 | val_min = None |
| 75 | val_max = None |
| 76 | |
| 77 | for idx, val in enumerate(aggregated): |
| 78 | while expected and full_sum + val >= expected[0]: |
| 79 | percentiles.append(idx) |
| 80 | del expected[0] |
| 81 | |
| 82 | full_sum += val |
| 83 | |
| 84 | if val != 0: |
| 85 | if val_min is None: |
| 86 | val_min = idx |
| 87 | val_max = idx |
| 88 | |
| 89 | res.perc_50, res.perc_90, res.perc_95, res.perc_99 = map(ts.bins_edges.__getitem__, percentiles) |
| 90 | res.min = ts.bins_edges[val_min] |
| 91 | res.max = ts.bins_edges[val_max] |
| 92 | res.bin_populations = aggregated |
| 93 | return res |
| 94 | |
| 95 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 96 | def groupby_globally(data: Iterable, key_func: Callable): |
| 97 | grouped = {} # type: ignore |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 98 | grouped_iter = itertools.groupby(data, key_func) |
| 99 | |
| 100 | for (bs, cache_tp, act, conc), curr_data_it in grouped_iter: |
| 101 | key = (bs, cache_tp, act, conc) |
| 102 | grouped.setdefault(key, []).extend(curr_data_it) |
| 103 | |
| 104 | return grouped |
| 105 | |
| 106 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 107 | def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]: |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 108 | """returns ynew - y values of some curve approximation""" |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 109 | return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef))) |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 110 | |
| 111 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 112 | def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]: |
| 113 | """ |
| 114 | x, y - test data, xnew - dots, where we want find approximation |
| 115 | if not relative_dist distance = y - newy |
| 116 | returns ynew - y values of linear approximation |
| 117 | """ |
| 118 | ox = numpy.array(x) |
| 119 | oy = numpy.array(y) |
koder aka kdanilov | 66839a9 | 2015-04-11 13:22:31 +0300 | [diff] [blame] | 120 | |
Ved-vampir | 0316644 | 2015-04-10 17:28:23 +0300 | [diff] [blame] | 121 | # set approximation function |
koder aka kdanilov | 66839a9 | 2015-04-11 13:22:31 +0300 | [diff] [blame] | 122 | def func_line(tpl, x): |
| 123 | return tpl[0] * x + tpl[1] |
| 124 | |
| 125 | def error_func_rel(tpl, x, y): |
| 126 | return 1.0 - y / func_line(tpl, x) |
| 127 | |
| 128 | def error_func_abs(tpl, x, y): |
| 129 | return y - func_line(tpl, x) |
| 130 | |
Ved-vampir | 0316644 | 2015-04-10 17:28:23 +0300 | [diff] [blame] | 131 | # choose distance mode |
koder aka kdanilov | 66839a9 | 2015-04-11 13:22:31 +0300 | [diff] [blame] | 132 | error_func = error_func_rel if relative_dist else error_func_abs |
| 133 | |
| 134 | tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]], |
| 135 | oy[:2])) |
| 136 | |
Ved-vampir | 0316644 | 2015-04-10 17:28:23 +0300 | [diff] [blame] | 137 | # find line |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 138 | tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy)) |
koder aka kdanilov | 66839a9 | 2015-04-11 13:22:31 +0300 | [diff] [blame] | 139 | |
Ved-vampir | 0316644 | 2015-04-10 17:28:23 +0300 | [diff] [blame] | 140 | # if error |
| 141 | if success not in range(1, 5): |
| 142 | raise ValueError("No line for this dots") |
koder aka kdanilov | 66839a9 | 2015-04-11 13:22:31 +0300 | [diff] [blame] | 143 | |
Ved-vampir | 0316644 | 2015-04-10 17:28:23 +0300 | [diff] [blame] | 144 | # return new dots |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 145 | return func_line(tpl_final, numpy.array(xnew)) |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 146 | |
| 147 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 148 | # TODO: revise next |
| 149 | # def difference(y, ynew): |
| 150 | # """returns average and maximum relative and |
| 151 | # absolute differences between y and ynew |
| 152 | # result may contain None values for y = 0 |
| 153 | # return value - tuple: |
| 154 | # [(abs dif, rel dif) * len(y)], |
| 155 | # (abs average, abs max), |
| 156 | # (rel average, rel max)""" |
| 157 | # |
| 158 | # abs_dlist = [] |
| 159 | # rel_dlist = [] |
| 160 | # |
| 161 | # for y1, y2 in zip(y, ynew): |
| 162 | # # absolute |
| 163 | # abs_dlist.append(y1 - y2) |
| 164 | # |
| 165 | # if y1 > 1E-6: |
| 166 | # rel_dlist.append(abs(abs_dlist[-1] / y1)) |
| 167 | # else: |
| 168 | # raise ZeroDivisionError("{0!r} is too small".format(y1)) |
| 169 | # |
| 170 | # da_avg = sum(abs_dlist) / len(abs_dlist) |
| 171 | # dr_avg = sum(rel_dlist) / len(rel_dlist) |
| 172 | # |
| 173 | # return (zip(abs_dlist, rel_dlist), |
| 174 | # (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist)) |
| 175 | # ) |