import math
import logging
import itertools
from typing import List, Callable, Iterable, cast, Tuple, Optional

import numpy
from scipy import stats, optimize
from numpy import linalg
from numpy.polynomial.chebyshev import chebfit, chebval


from .result_classes import NormStatProps, HistoStatProps, TimeSeries
from .utils import Number
koder aka kdanilov | bb6d6cd | 2015-06-20 02:55:07 +0300 | [diff] [blame] | 14 | |
| 15 | |
logger = logging.getLogger("wally")

# NOTE(review): presumably a tolerance for float comparisons; it is not used in this part of the file
DOUBLE_DELTA = 1e-8

# minimal sample size for which calc_norm_stat_props computes a confidence interval
MIN_VALUES_FOR_CONFIDENCE = 7


average = numpy.mean


def dev(x):
    """Return the sample standard deviation of x (ddof=1) as a python float"""
    return math.sqrt(numpy.var(x, ddof=1))
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 23 | |
| 24 | |
def calc_norm_stat_props(ts: TimeSeries, bins_count: int = None, confidence: float = 0.95) -> NormStatProps:
    """
    Calculate statistical properties of the values stored in ts.data

    ts - series to analyze, ts.data and ts.units are read
    bins_count - if not None, a histogram with this many bins is stored in the result
    confidence - confidence level for the confidence interval half-width

    Raises ValueError if ts.data is empty.
    """

    # array.array has very basic support
    values = cast(List[int], ts.data)
    res = NormStatProps(values, ts.units)  # type: ignore

    if len(values) == 0:
        raise ValueError("Input array is empty")

    ordered = sorted(values)
    res.average = average(ordered)
    res.deviation = dev(ordered)

    res.min, res.max = ordered[0], ordered[-1]

    (res.perc_1, res.perc_5, res.perc_10, res.perc_50,
     res.perc_90, res.perc_95, res.perc_99) = numpy.percentile(ordered, q=[1.0, 5.0, 10., 50., 90., 95., 99.])

    # confidence interval is reported only when there are enough samples
    if len(ordered) < MIN_VALUES_FOR_CONFIDENCE:
        res.confidence = None
        res.confidence_level = None
    else:
        std_error = stats.sem(ordered)
        t_quantile = stats.t.ppf((1 + confidence) / 2, len(ordered) - 1)
        res.confidence = std_error * t_quantile
        res.confidence_level = confidence

    if bins_count is not None:
        populations, edges = numpy.histogram(ordered, bins=bins_count)
        res.bins_populations = populations
        # drop the rightmost edge, so there is exactly one (left) edge per bin
        res.bins_edges = edges[:-1]

    # normality test is best-effort: a failure is logged, not propagated
    try:
        res.normtest = stats.mstats.normaltest(ordered)
    except Exception as exc:
        logger.warning("stats.mstats.normaltest failed with error: %s", exc)

    res.skew = stats.skew(ordered)
    res.kurt = stats.kurtosis(ordered)

    return res
| 66 | |
| 67 | |
def rebin_histogram(bins_populations: numpy.ndarray,
                    bins_edges: numpy.ndarray,
                    new_bins_count: int,
                    left_tail_idx: Optional[int] = None,
                    right_tail_idx: Optional[int] = None,
                    log_bins: bool = False) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """
    Rebin large histogram into smaller one with new_bins_count bins.

    bins_populations - 1D array of per-bin counts
    bins_edges - 1D array of the same length, bins_edges[i] is used as the left edge of bin i
    new_bins_count - number of bins in the result
    left_tail_idx, right_tail_idx - indexes into bins_edges, which select the value range,
                                    covered by new bins. Whole bins_edges range is used when None
    log_bins - make new bins edges a geometric progression instead of a linear one

    Returns (new_populations, new_left_edges), both of length new_bins_count.
    Source bins outside of the selected range are folded into the first/last new bin,
    so the sum of populations is preserved.
    """

    assert len(bins_populations.shape) == 1
    assert len(bins_edges.shape) == 1
    assert bins_edges.shape[0] == bins_populations.shape[0]

    min_val = bins_edges[0] if left_tail_idx is None else bins_edges[left_tail_idx]
    max_val = bins_edges[-1] if right_tail_idx is None else bins_edges[right_tail_idx]

    if log_bins:
        assert min_val > 1E-3
        step = (max_val / min_val) ** (1 / new_bins_count)
        new_bins_edges = min_val * (step ** numpy.arange(new_bins_count))  # type: numpy.ndarray
    else:
        new_bins_edges = numpy.linspace(min_val, max_val, new_bins_count + 1, dtype='float')[:-1]  # type: numpy.ndarray

    # index of the new bin each old edge falls into, edges below min_val are mapped to the first new bin
    old_bins_pos = numpy.searchsorted(new_bins_edges, bins_edges, side='right') - 1
    old_bins_pos[old_bins_pos < 0] = 0

    new_bins = numpy.zeros(new_bins_count, dtype=int)  # type: numpy.ndarray

    # last source bin has no right edge, so it can't be split
    # TODO: need to add assert for this
    new_bins[-1] += bins_populations[-1]
    bin_sizes = bins_edges[1:] - bins_edges[:-1]

    for population, left_edge, begin, end, bsize in zip(bins_populations[:-1],
                                                       bins_edges[:-1],
                                                       old_bins_pos[:-1],
                                                       old_bins_pos[1:],
                                                       bin_sizes):
        if begin == end:
            # source bin lies completely inside one new bin
            new_bins[begin] += population
            continue

        # source bin spans several new bins - split it proportionally to the overlap
        density = population / bsize
        remaining = population
        for curr_box in range(begin, end):
            # the first touched new bin is overlapped only from the source bin left edge
            box_start = left_edge if curr_box == begin else new_bins_edges[curr_box]
            overlap = new_bins_edges[curr_box + 1] - box_start
            cnt = min(int(overlap * density + 0.5), remaining)
            new_bins[curr_box] += cnt
            remaining -= cnt

        # whatever is left belongs to the new bin, which holds the source bin right edge
        new_bins[end] += remaining

    return new_bins, new_bins_edges
| 123 | |
| 124 | |
def calc_histo_stat_props(ts: TimeSeries,
                          bins_edges: Optional[numpy.ndarray] = None,
                          rebins_count: Optional[int] = None,
                          tail: float = 0.005) -> HistoStatProps:
    """
    Calculate statistical properties of histogram data stored in ts.data

    ts - series to analyze, ts.data is summed along axis 0 to get one aggregated histogram
    bins_edges - edges of histogram bins, ts.histo_bins is used when None
                 (NOTE(review): presumably bins_edges[i] is the left edge of bin i - confirm)
    rebins_count - if not None, the aggregated histogram is rebinned into this many bins
    tail - fraction of total population on each side, which is excluded from the rebinned range

    Raises IndexError if all bins are empty.
    """
    if bins_edges is None:
        bins_edges = ts.histo_bins

    res = HistoStatProps(ts.data, ts.units)

    # sum across all series
    aggregated = ts.data.sum(axis=0, dtype='int')
    total = aggregated.sum()

    # percentiles levels
    expected = list(numpy.array([0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99]) * total)
    cumsum = numpy.cumsum(aggregated)

    # each percentile is reported as the edge of the first bin, where cumulative population reaches the level
    percentiles_bins = numpy.searchsorted(cumsum, expected)
    percentiles = bins_edges[percentiles_bins]
    res.perc_1, res.perc_5, res.perc_10, res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles

    # don't show tail ranges on histogram
    left_tail_idx, right_tail_idx = numpy.searchsorted(cumsum, [tail * total, (1 - tail) * total])

    # minimal and maximal non-zero elements
    non_zero = numpy.nonzero(aggregated)[0]

    # index edges by bin position, not by the bin population
    res.min = bins_edges[non_zero[0]]

    # next edge of the last populated bin, or its own edge if there is no next one
    res.max = bins_edges[min(non_zero[-1] + 1, len(bins_edges) - 1)]

    res.log_bins = False
    if rebins_count is not None:
        res.bins_populations, res.bins_edges = rebin_histogram(aggregated, bins_edges, rebins_count,
                                                               left_tail_idx, right_tail_idx)
    else:
        res.bins_populations = aggregated
        res.bins_edges = bins_edges.copy()

    return res
| 163 | |
| 164 | |
def groupby_globally(data: Iterable, key_func: Callable):
    """
    Group items of data by key_func(item) into a dict, mapping key to list of items.
    Unlike itertools.groupby, non-adjacent items with equal keys end in the same group.
    Every key must unpack into exactly four values.
    """
    grouped = {}  # type: ignore

    for raw_key, items in itertools.groupby(data, key_func):
        bs, cache_tp, act, conc = raw_key
        bucket = grouped.setdefault((bs, cache_tp, act, conc), [])
        bucket.extend(items)

    return grouped
| 174 | |
| 175 | |
def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
    """
    Fit a Chebyshev series of degree curved_coef to points (x, y)
    and return its values in xnew points
    """
    coefs = chebfit(x, y, curved_coef)
    ynew = chebval(xnew, coefs)
    return cast(List[float], ynew)
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 179 | |
| 180 | |
def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
    """
    Least-squares fit of a straight line to (x, y) points.

    x, y - test data, xnew - dots, where we want to find approximation
    relative_dist - if True, minimize 1 - y / line(x), otherwise minimize y - line(x)
    returns ynew - values of the fitted line in xnew dots

    Raises ValueError if optimizer reports a failure.
    """
    xs = numpy.array(x)
    ys = numpy.array(y)

    # line is described by (slope, shift) pair
    def line(coefs, pts):
        return coefs[0] * pts + coefs[1]

    def relative_residual(coefs, pts, vals):
        return 1.0 - vals / line(coefs, pts)

    def absolute_residual(coefs, pts, vals):
        return vals - line(coefs, pts)

    # choose distance mode
    if relative_dist:
        residual = relative_residual
    else:
        residual = absolute_residual

    # initial guess - line, which goes exactly through the first two dots
    first_two = [[xs[0], 1.0], [xs[1], 1.0]]
    start = tuple(linalg.solve(first_two, ys[:2]))

    # find line
    fitted, status = optimize.leastsq(residual, start, args=(xs, ys))

    # leastsq reports success with status codes 1..4
    if status not in (1, 2, 3, 4):
        raise ValueError("No line for this dots")

    # return new dots
    return line(fitted, numpy.array(xnew))
koder aka kdanilov | 6c49106 | 2015-04-09 22:33:13 +0300 | [diff] [blame] | 215 | |
| 216 | |
def moving_average(data: numpy.array, window: int) -> numpy.array:
    """
    Return averages of all full windows of `window` consecutive elements of data.
    Result has len(data) - window + 1 elements.
    """
    totals = numpy.cumsum(data)

    # difference of two running sums, `window` positions apart, is the sum over one window
    window_sums = totals[window:] - totals[:-window]
    totals[window:] = window_sums

    full_windows = totals[window - 1:]
    return full_windows / window
| 221 | |
| 222 | |
def moving_dev(data: numpy.ndarray, window: int) -> numpy.ndarray:
    """
    Return sample standard deviations (ddof=1) of all full windows of `window`
    consecutive elements of data. Result has len(data) - window + 1 elements.

    window is expected to be >= 2, as the variance is divided by (window - 1).
    """
    # work in floats, so squares of integer input can't overflow
    values = numpy.asarray(data, dtype=float)

    cumsum = numpy.cumsum(values)
    cumsum2 = numpy.cumsum(values ** 2)

    # turn running sums into per-window sums
    cumsum[window:] = cumsum[window:] - cumsum[:-window]
    cumsum2[window:] = cumsum2[window:] - cumsum2[:-window]

    variance = (cumsum2[window - 1:] - cumsum[window - 1:] ** 2 / window) / (window - 1)

    # rounding errors can make variance of (almost) constant window slightly negative,
    # which would turn into NaN after the square root
    return numpy.maximum(variance, 0.0) ** 0.5
| 229 | |
| 230 | |
def find_ouliers(data: numpy.array,
                 center_range: Tuple[int, int] = (25, 75),
                 cut_range: float = 3.0) -> numpy.array:
    """
    Return boolean mask of outliers in data.

    center_range - pair of percentiles, which bounds the central part of the data
    cut_range - value is an outlier, if it is further from the middle of the central part
                than cut_range half-widths of the central part
    """
    low, high = numpy.percentile(data, center_range)
    center = (low + high) / 2
    half_width = (high - low) / 2
    return numpy.abs(data - center) > (half_width * cut_range)
| 236 | |
| 237 | |
def find_ouliers_ts(data: numpy.ndarray,
                    windows_size: int = 30,
                    center_range: Tuple[int, int] = (25, 75),
                    cut_range: float = 3.0) -> numpy.ndarray:
    """
    Return boolean mask of outliers in time series, outliers are searched window by window.

    data - series values
    windows_size - minimal window length. Series shorter than windows_size has no outliers,
                   series shorter than two windows is processed as one window
    center_range, cut_range - passed to find_ouliers for each window

    Windows never get shorter than windows_size: the first window takes half of the
    remainder of len(data) / windows_size, the last one takes everything left.
    """
    outliers = numpy.zeros(data.shape, dtype=bool)
    total = len(data)

    if total < windows_size:
        return outliers

    if windows_size <= 0 or total < windows_size * 2:
        # not enough data for two full windows - use all the data at once
        end_idx = total
    else:
        end_idx = (total % windows_size) // 2 + windows_size

    begin_idx = 0
    while True:
        cdata = data[begin_idx: end_idx]
        outliers[begin_idx: end_idx] = find_ouliers(cdata, center_range, cut_range)

        if end_idx == total:
            break

        begin_idx = end_idx
        end_idx += windows_size

        # don't leave a tail, shorter than window - merge it into the last window
        if total - end_idx < windows_size:
            end_idx = total

    return outliers
| 267 | |
| 268 | |
def hist_outliers_nd(bin_populations: numpy.array,
                     bin_centers: numpy.array,
                     center_range: Tuple[int, int] = (25, 75),
                     cut_range: float = 3.0) -> Tuple[int, int]:
    """
    Find range of histogram bins, which are not outliers.

    bin_populations, bin_centers - histogram populations and bin centers, must have equal length
    center_range - pair of percentiles, which bounds the central part of the distribution
    cut_range - bins further from the middle of the central part than cut_range
                half-widths of the central part are cut

    Returns (lower_idx, upper_idx) - positions of the cut bounds in bin_centers.
    """
    assert len(bin_populations) == len(bin_centers)
    one_percent = bin_populations.sum() / 100.0

    # bins, where cumulative population reaches the requested percentiles
    targets = [one_percent * center_range[0], one_percent * center_range[1]]
    low_idx, high_idx = numpy.searchsorted(numpy.cumsum(bin_populations), targets)

    low_center = bin_centers[low_idx]
    high_center = bin_centers[high_idx]
    middle = (high_center + low_center) / 2
    r = (high_center - low_center) / 2

    spread = r * cut_range
    lower_cut_idx, upper_cut_idx = numpy.searchsorted(bin_centers, [middle - spread, middle + spread])
    return lower_cut_idx, upper_cut_idx
| 288 | |
| 289 | |
def hist_outliers_perc(bin_populations: numpy.ndarray,
                       bounds_perc: Tuple[float, float] = (0.01, 0.99),
                       min_bins_left: Optional[int] = None) -> Tuple[int, int]:
    """
    Find range of histogram bins between two population quantiles.

    bin_populations - 1D array of bin populations
    bounds_perc - lower and upper bounds, as fractions of the total population
    min_bins_left - if not None, the range is widened to hold at least this many bins
                    (or all bins, if histogram is smaller)

    Returns (idx1, idx2) - indexes of bins, where cumulative population reaches the bounds.
    """
    assert len(bin_populations.shape) == 1
    total_count = bin_populations.sum()
    lower_perc = total_count * bounds_perc[0]
    upper_perc = total_count * bounds_perc[1]
    idx1, idx2 = numpy.searchsorted(numpy.cumsum(bin_populations), [lower_perc, upper_perc])

    # don't cut too many bins. At least min_bins_left must be left
    if min_bins_left is not None and idx2 - idx1 < min_bins_left:
        missing = min_bins_left - (idx2 - idx1)
        room_left = idx1
        room_right = len(bin_populations) - idx2

        # widen both sides evenly, a side that hits the array border passes the rest to the other one
        grow_left = min(room_left, missing // 2)
        grow_right = min(room_right, missing - grow_left)
        grow_left = min(room_left, missing - grow_right)

        idx1 -= grow_left
        idx2 += grow_right

    return idx1, idx2
koder aka kdanilov | a732a60 | 2017-02-01 20:29:56 +0200 | [diff] [blame] | 306 | |
| 307 | |
def ts_hist_outliers_perc(bin_populations: numpy.ndarray,
                          window_size: int = 10,
                          bounds_perc: Tuple[float, float] = (0.01, 0.99),
                          min_bins_left: Optional[int] = None) -> Tuple[int, int]:
    """
    Find range of bins, which covers hist_outliers_perc ranges of all time windows.

    bin_populations - 2D array, rows are grouped into windows of window_size consecutive rows
                      (the last window may be shorter), rows of each window are summed into one histogram
    bounds_perc, min_bins_left - passed to hist_outliers_perc for each window

    Returns (min of lower indexes, max of upper indexes) over all windows.
    Raises ValueError if bin_populations has no rows.
    """
    assert len(bin_populations.shape) == 2

    rows_count = len(bin_populations)
    if rows_count == 0:
        raise ValueError("bin_populations has no rows")

    # windows borders. Closing border is always added, so the last window is
    # processed as well, when rows_count is a multiple of window_size
    points = list(range(0, rows_count, window_size))
    points.append(rows_count)

    ranges = []  # type: List[Tuple[int, int]]
    for begin, end in zip(points[:-1], points[1:]):
        window_hist = bin_populations[begin:end].sum(axis=0)
        ranges.append(hist_outliers_perc(window_hist, bounds_perc=bounds_perc, min_bins_left=min_bins_left))

    return min(i[0] for i in ranges), max(i[1] for i in ranges)
| 324 | |
| 325 | |
# TODO: revise next
# def difference(y, ynew):
#     """returns average and maximum relative and
#     absolute differences between y and ynew
#     result may contain None values for y = 0
#     return value - tuple:
#     [(abs dif, rel dif) * len(y)],
#     (abs average, abs max),
#     (rel average, rel max)"""
#
#     abs_dlist = []
#     rel_dlist = []
#
#     for y1, y2 in zip(y, ynew):
#         # absolute
#         abs_dlist.append(y1 - y2)
#
#         if y1 > 1E-6:
#             rel_dlist.append(abs(abs_dlist[-1] / y1))
#         else:
#             raise ZeroDivisionError("{0!r} is too small".format(y1))
#
#     da_avg = sum(abs_dlist) / len(abs_dlist)
#     dr_avg = sum(rel_dlist) / len(rel_dlist)
#
#     return (zip(abs_dlist, rel_dlist),
#             (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
#             )