koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 1 | import math |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 2 | import logging |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 3 | import itertools |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 4 | import statistics |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame] | 5 | from typing import List, Callable, Iterable, cast |
koder aka kdanilov | cff7b2e | 2015-04-18 20:48:15 +0300 | [diff] [blame] | 6 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 7 | import numpy |
| 8 | from scipy import stats, optimize |
| 9 | from numpy import linalg |
| 10 | from numpy.polynomial.chebyshev import chebfit, chebval |
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 11 | |
| 12 | |
koder aka kdanilov | f286517 | 2016-12-30 03:35:11 +0200 | [diff] [blame] | 13 | from .result_classes import NormStatProps, HistoStatProps, TimeSeries |
koder aka kdanilov | ffaf48d | 2016-12-27 02:25:29 +0200 | [diff] [blame] | 14 | from .utils import Number |
koder aka kdanilov | bb6d6cd | 2015-06-20 02:55:07 +0300 | [diff] [blame] | 15 | |
| 16 | |
logger = logging.getLogger("wally")

# tolerance for floating-point comparisons
DOUBLE_DELTA = 1e-8

# below this many samples a confidence interval is not statistically meaningful
MIN_VALUES_FOR_CONFIDENCE = 7


# arithmetic mean of a sequence
average = numpy.mean


def dev(x: Iterable[Number]) -> float:
    """Sample (ddof=1) standard deviation of *x*."""
    return math.sqrt(numpy.var(x, ddof=1))
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 24 | |
| 25 | |
def calc_norm_stat_props(ts: TimeSeries, bins_count: int, confidence: float = 0.95) -> NormStatProps:
    """Calculate statistical properties of an array of numbers, treating it
    as (roughly) normally distributed.

    Parameters:
        ts: time series whose ``data`` attribute holds the samples
        bins_count: number of bins for the histogram stored in the result
        confidence: confidence level for the mean's confidence interval

    Raises:
        ValueError: if the input series is empty
    """

    # array.array has very basic support, so treat it as a plain list of ints
    data = cast(List[int], ts.data)
    res = NormStatProps(data)  # type: ignore

    if len(data) == 0:
        raise ValueError("Input array is empty")

    data = sorted(data)
    res.average = average(data)
    res.deviation = dev(data)

    # data is sorted, so min/max are the ends
    res.max = data[-1]
    res.min = data[0]

    # BUG FIX: perc_95 used to be skipped (perc_99 was assigned twice),
    # so the 95th percentile was silently dropped
    res.perc_50, res.perc_90, res.perc_95, res.perc_99 = numpy.percentile(data, q=[50., 90., 95., 99.])

    # a confidence interval over too few samples is meaningless
    if len(data) >= MIN_VALUES_FOR_CONFIDENCE:
        res.confidence = stats.sem(data) * \
                         stats.t.ppf((1 + confidence) / 2, len(data) - 1)
        res.confidence_level = confidence
    else:
        res.confidence = None
        res.confidence_level = None

    res.bins_populations, bins_edges = numpy.histogram(data, bins=bins_count)
    res.bins_mids = (bins_edges[:-1] + bins_edges[1:]) / 2

    # normaltest may fail (e.g. on degenerate input); keep going without it
    try:
        res.normtest = stats.mstats.normaltest(data)
    except Exception as exc:
        logger.warning("stats.mstats.normaltest failed with error: %s", exc)

    res.skew = stats.skew(data)
    res.kurt = stats.kurtosis(data)

    return res
| 65 | |
| 66 | |
def calc_histo_stat_props(ts: TimeSeries,
                          bins_edges: numpy.array,
                          bins_count: int,
                          min_valuable: float = 0.0001) -> HistoStatProps:
    """Calculate statistical properties of histogram-shaped time-series data.

    Parameters:
        ts: time series; ``ts.data`` is a flat array of bin populations,
            ``ts.second_axis_size`` bins per measurement
        bins_edges: values of the histogram bin edges
        bins_count: number of bins in the aggregated result histogram
        min_valuable: fraction of the total population below which a bin is
            ignored when building the result histogram

    Returns:
        HistoStatProps with percentiles, min/max and the re-binned histogram.
    """
    # reshape the flat data into one row per measurement
    data = numpy.array(ts.data, dtype='int')
    data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size]  # type: ignore

    res = HistoStatProps(ts.data, ts.second_axis_size)

    # summ across all series
    aggregated = numpy.sum(data, axis=0, dtype='int')
    total = numpy.sum(aggregated)

    # minimal value used for histo
    min_val_on_histo = total * min_valuable

    # percentiles levels
    expected = [total * 0.5, total * 0.9, total * 0.95, total * 0.99]
    percentiles = []

    # all indexes, where values greater than min_val_on_histo
    valuable_idxs = []

    curr_summ = 0
    non_zero = aggregated.nonzero()[0]

    # calculate percentiles and valuable_indexes in one pass over the
    # cumulative population
    for idx in non_zero:
        val = aggregated[idx]
        while expected and curr_summ + val >= expected[0]:
            percentiles.append(bins_edges[idx])
            del expected[0]

        curr_summ += val

        if val >= min_val_on_histo:
            valuable_idxs.append(idx)

    res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles

    # minimal and maximal non-zero elements
    # BUG FIX: was bins_edges[aggregated[non_zero[0]]] - indexed the edges
    # array by a bin *population* instead of by the bin index
    res.min = bins_edges[non_zero[0]]
    # NOTE(review): if bins_edges holds one more edge than aggregated has
    # bins, this guard should compare against len(bins_edges) - 1; cannot
    # tell the edge-array convention from here - left unchanged
    res.max = bins_edges[non_zero[-1] + (1 if non_zero[-1] != len(bins_edges) else 0)]

    # minimal and maximal valuable elements
    val_idx_min = valuable_idxs[0]
    val_idx_max = valuable_idxs[-1]

    raw_bins_populations = aggregated[val_idx_min: val_idx_max + 1]
    raw_bins_edges = bins_edges[val_idx_min: val_idx_max + 2]
    raw_bins_mids = cast(numpy.array, (raw_bins_edges[1:] + raw_bins_edges[:-1]) / 2)

    # BUG FIX: step was (last + first) / bins_count, which only splits the
    # [first, last] range into bins_count equal pieces when first == 0
    step = (raw_bins_mids[-1] - raw_bins_mids[0]) / bins_count
    # renamed from 'next', which shadowed the builtin
    right_edge = raw_bins_mids[0]

    # aggregate raw histogram with many bins into result histogram with bins_count bins
    cidx = 0
    bins_populations = []
    bins_mids = []

    while cidx < len(raw_bins_mids):
        right_edge += step
        bin_population = 0

        # sweep all raw bins whose mid falls into the current result bin
        while cidx < len(raw_bins_mids) and raw_bins_mids[cidx] <= right_edge:
            bin_population += raw_bins_populations[cidx]
            cidx += 1

        bins_populations.append(bin_population)
        bins_mids.append(right_edge - step / 2)

    res.bins_populations = numpy.array(bins_populations, dtype='int')
    res.bins_mids = numpy.array(bins_mids, dtype='float32')

    return res
| 142 | |
| 143 | |
def groupby_globally(data: Iterable, key_func: Callable):
    """Bucket every item of *data* by its key_func key into a dict.

    Unlike bare itertools.groupby, non-adjacent items with an equal key
    land in the same bucket.  Each key must be a 4-tuple of
    (block size, cache type, action, concurrency).
    """
    buckets = {}  # type: ignore

    for (bs, cache_tp, act, conc), items in itertools.groupby(data, key_func):
        buckets.setdefault((bs, cache_tp, act, conc), []).extend(items)

    return buckets
| 153 | |
| 154 | |
def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
    """Return ynew - values at *xnew* of a Chebyshev polynomial of degree
    *curved_coef* least-squares-fitted to the (x, y) samples."""
    cheb_coefs = chebfit(x, y, curved_coef)
    return cast(List[float], chebval(xnew, cheb_coefs))
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 158 | |
| 159 | |
def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
    """Least-squares fit a straight line to (x, y) and evaluate it at *xnew*.

    When relative_dist is False the optimizer minimizes absolute residuals
    (y - line(x)); otherwise it minimizes relative residuals.
    Raises ValueError when the optimizer fails to converge.
    """
    xs = numpy.array(x)
    ys = numpy.array(y)

    # the model: a straight line a * t + b, packed as coefs = (a, b)
    def line_at(coefs, t):
        return coefs[0] * t + coefs[1]

    # residuals measured in relative units
    def rel_residuals(coefs, t, vals):
        return 1.0 - vals / line_at(coefs, t)

    # residuals measured in absolute units
    def abs_residuals(coefs, t, vals):
        return vals - line_at(coefs, t)

    # choose distance mode
    residuals = rel_residuals if relative_dist else abs_residuals

    # seed the optimizer with the exact line through the first two points
    initial = tuple(linalg.solve([[xs[0], 1.0], [xs[1], 1.0]],
                                 ys[:2]))

    fitted, status = optimize.leastsq(residuals, initial[:], args=(xs, ys))

    # leastsq reports success with a status code in 1..4
    if status not in range(1, 5):
        raise ValueError("No line for this dots")

    # evaluate the fitted line at the requested points
    return line_at(fitted, numpy.array(xnew))
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 194 | |
| 195 | |
koder aka kdanilov | 7f59d56 | 2016-12-26 01:34:23 +0200 | [diff] [blame] | 196 | # TODO: revise next |
| 197 | # def difference(y, ynew): |
| 198 | # """returns average and maximum relative and |
| 199 | # absolute differences between y and ynew |
| 200 | # result may contain None values for y = 0 |
| 201 | # return value - tuple: |
| 202 | # [(abs dif, rel dif) * len(y)], |
| 203 | # (abs average, abs max), |
| 204 | # (rel average, rel max)""" |
| 205 | # |
| 206 | # abs_dlist = [] |
| 207 | # rel_dlist = [] |
| 208 | # |
| 209 | # for y1, y2 in zip(y, ynew): |
| 210 | # # absolute |
| 211 | # abs_dlist.append(y1 - y2) |
| 212 | # |
| 213 | # if y1 > 1E-6: |
| 214 | # rel_dlist.append(abs(abs_dlist[-1] / y1)) |
| 215 | # else: |
| 216 | # raise ZeroDivisionError("{0!r} is too small".format(y1)) |
| 217 | # |
| 218 | # da_avg = sum(abs_dlist) / len(abs_dlist) |
| 219 | # dr_avg = sum(rel_dlist) / len(rel_dlist) |
| 220 | # |
| 221 | # return (zip(abs_dlist, rel_dlist), |
| 222 | # (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist)) |
| 223 | # ) |