import math
import itertools
import statistics
from typing import Union, List, TypeVar, Callable, Iterable, Tuple, Any, cast, Dict

import numpy
from scipy import stats, optimize
from numpy import linalg
from numpy.polynomial.chebyshev import chebfit, chebval


from .result_classes import IStorable


Number = Union[int, float]
TNumber = TypeVar('TNumber', int, float)


DOUBLE_DELTA = 1e-8


average = statistics.mean
dev = statistics.variance  # NOTE: statistics.variance returns the variance, not the standard deviation


class StatProps(IStorable):
    "Statistical properties of a time series"

    yaml_tag = 'stat'

    def __init__(self, data: List[Number]) -> None:
        self.average = None  # type: float
        self.deviation = None  # type: float
        self.confidence = None  # type: float
        self.confidence_level = None  # type: float

        self.perc_99 = None  # type: float
        self.perc_95 = None  # type: float
        self.perc_90 = None  # type: float
        self.perc_50 = None  # type: float

        self.min = None  # type: Number
        self.max = None  # type: Number

        # (bin counts, bin edges), as returned by numpy.histogram
        self.histo = None  # type: Tuple[List[int], List[float]]
        self.data = data

        self.normtest = None  # type: Any

    def __str__(self) -> str:
        res = ["StatProps(num_el={}):".format(len(self.data)),
               "    distr = {0.average} ~ {0.deviation}".format(self),
               "    confidence({0.confidence_level}) = {0.confidence}".format(self),
               "    perc50={0.perc_50}".format(self),
               "    perc90={0.perc_90}".format(self),
               "    perc95={0.perc_95}".format(self),
               "    perc99={0.perc_99}".format(self),
               "    range {0.min} {0.max}".format(self),
               "    normtest = {0.normtest}".format(self)]
        return "\n".join(res)

    def __repr__(self) -> str:
        return str(self)

    def raw(self) -> Dict[str, Any]:
        return self.__dict__.copy()

    @classmethod
    def fromraw(cls, data: Dict[str, Any]) -> 'StatProps':
        res = cls.__new__(cls)
        res.__dict__.update(data)
        return res


def greater_digit_pos(val: Number) -> int:
    return int(math.floor(math.log10(val))) + 1


def round_digits(val: TNumber, num_digits: int = 3) -> TNumber:
    step = 10 ** (greater_digit_pos(val) - num_digits)
    return type(val)(int(val / step) * step)
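

# Illustrative usage sketch (not part of the original module): round_digits()
# keeps the `num_digits` most significant digits of a value and truncates the
# rest, preserving the input type:
#
#     round_digits(12345)         # -> 12300
#     round_digits(12345, 2)      # -> 12000
#     round_digits(3.14159, 3)    # -> ~3.14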


def calc_stat_props(data: List[Number], confidence: float = 0.95) -> StatProps:
    "Calculate statistical properties of an array of numbers"

    if len(data) == 0:
        raise ValueError("Input array is empty")

    res = StatProps(data)

    data = sorted(data)
    res.average = average(data)
    res.deviation = dev(data)
    res.max = data[-1]
    res.min = data[0]

    res.perc_50 = numpy.percentile(data, 50)
    res.perc_90 = numpy.percentile(data, 90)
    res.perc_95 = numpy.percentile(data, 95)
    res.perc_99 = numpy.percentile(data, 99)

    if len(data) >= 3:
        res.confidence = stats.sem(data) * \
            stats.t.ppf((1 + confidence) / 2, len(data) - 1)
    else:
        res.confidence = None
    res.confidence_level = confidence

    res.histo = numpy.histogram(data, 'auto')
    res.normtest = stats.mstats.normaltest(data)
    return res
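

# Minimal usage sketch (illustrative only; the sample values are made up):
#
#     props = calc_stat_props([float(x) for x in range(1, 21)], confidence=0.95)
#     print(props)                          # human-readable summary via __str__
#     stored = props.raw()                  # plain dict, e.g. for serialization
#     restored = StatProps.fromraw(stored)  # rebuild without recomputing stats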


def groupby_globally(data: Iterable, key_func: Callable):
    grouped = {}  # type: ignore
    grouped_iter = itertools.groupby(data, key_func)

    # itertools.groupby only joins *adjacent* items with equal keys, so merge
    # the groups into a dict to also combine non-adjacent ones
    for (bs, cache_tp, act, conc), curr_data_it in grouped_iter:
        key = (bs, cache_tp, act, conc)
        grouped.setdefault(key, []).extend(curr_data_it)

    return grouped
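

# Illustrative sketch (the records below are hypothetical): key_func must return
# a 4-item tuple, which the loop above unpacks as (bs, cache_tp, act, conc):
#
#     runs = [(4, 'wb', 'randwrite', 1, 120.0),
#             (4, 'wb', 'randwrite', 1, 118.5),
#             (64, 'wt', 'read', 8, 950.0)]
#     grouped = groupby_globally(runs, key_func=lambda r: r[:4])
#     # grouped[(4, 'wb', 'randwrite', 1)] now holds both matching records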


def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
    """returns ynew - values of a degree-`curved_coef` Chebyshev polynomial
    approximation fitted to (x, y) and evaluated at xnew"""
    return cast(List[float], chebval(xnew, chebfit(x, y, curved_coef)))
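

# Illustrative sketch (made-up points): fit a degree-2 Chebyshev polynomial to
# noisy samples of y = x**2 and evaluate it at new positions:
#
#     x = [1.0, 2.0, 3.0, 4.0, 5.0]
#     y = [1.1, 3.9, 9.2, 15.8, 25.1]
#     ynew = approximate_curve(x, y, xnew=[1.5, 2.5, 3.5], curved_coef=2)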


def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
    """
    x, y - test data, xnew - points where the approximation is evaluated
    distance = y - ynew if relative_dist is False, else 1 - y / ynew
    returns ynew - y values of the linear approximation
    """
    ox = numpy.array(x)
    oy = numpy.array(y)

    # approximation function: a straight line y = a * x + b
    def func_line(tpl, x):
        return tpl[0] * x + tpl[1]

    def error_func_rel(tpl, x, y):
        return 1.0 - y / func_line(tpl, x)

    def error_func_abs(tpl, x, y):
        return y - func_line(tpl, x)

    # choose distance mode
    error_func = error_func_rel if relative_dist else error_func_abs

    # initial guess: the line through the first two points
    tpl_initial = tuple(linalg.solve([[ox[0], 1.0], [ox[1], 1.0]],
                                     oy[:2]))

    # fit the line with least squares
    tpl_final, success = optimize.leastsq(error_func, tpl_initial[:], args=(ox, oy))

    # leastsq reports success via an integer flag in the range 1..4
    if success not in range(1, 5):
        raise ValueError("Can't fit a line to these points")

    # evaluate the fitted line at the new points
    return func_line(tpl_final, numpy.array(xnew))
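

# Illustrative sketch (made-up points): fit y ~ a*x + b by least squares and
# evaluate the fitted line at new x positions:
#
#     x = [1.0, 2.0, 3.0, 4.0]
#     y = [2.1, 3.9, 6.2, 8.1]
#     ynew = approximate_line(x, y, xnew=[5.0, 6.0])       # roughly [10, 12]
#     ynew_rel = approximate_line(x, y, xnew=[5.0, 6.0], relative_dist=True)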


# TODO: revise next
# def difference(y, ynew):
#     """returns average and maximum relative and
#     absolute differences between y and ynew
#     result may contain None values for y = 0
#     return value - tuple:
#     [(abs dif, rel dif) * len(y)],
#     (abs average, abs max),
#     (rel average, rel max)"""
#
#     abs_dlist = []
#     rel_dlist = []
#
#     for y1, y2 in zip(y, ynew):
#         # absolute
#         abs_dlist.append(y1 - y2)
#
#         if y1 > 1E-6:
#             rel_dlist.append(abs(abs_dlist[-1] / y1))
#         else:
#             raise ZeroDivisionError("{0!r} is too small".format(y1))
#
#     da_avg = sum(abs_dlist) / len(abs_dlist)
#     dr_avg = sum(rel_dlist) / len(rel_dlist)
#
#     return (zip(abs_dlist, rel_dlist),
#             (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
#             )