Blame - wally/statistic.py - mcp/cvp-wally

blob: 047f86d4817bda0a040c86da494733a147fccfc8 [file] [log] [blame]

koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	1	import math
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	2	import logging
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	3	import itertools
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	4	from typing import List, Callable, Iterable, cast, Tuple
koder aka kdanilov	cff7b2e	2015-04-18 20:48:15 +0300	[diff] [blame]	5
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	6	import numpy
				7	from scipy import stats, optimize
				8	from numpy import linalg
				9	from numpy.polynomial.chebyshev import chebfit, chebval
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	10
				11
koder aka kdanilov	f286517	2016-12-30 03:35:11 +0200	[diff] [blame]	12	from .result_classes import NormStatProps, HistoStatProps, TimeSeries
koder aka kdanilov	ffaf48d	2016-12-27 02:25:29 +0200	[diff] [blame]	13	from .utils import Number
koder aka kdanilov	bb6d6cd	2015-06-20 02:55:07 +0300	[diff] [blame]	14
				15
# Logger shared by this module's warnings.
logger = logging.getLogger("wally")

# NOTE(review): presumably a tolerance for comparing doubles; no use is visible in this section.
DOUBLE_DELTA = 1e-8

# Smallest sample count for which calc_norm_stat_props fills in a confidence interval.
MIN_VALUES_FOR_CONFIDENCE = 7


# Arithmetic mean, straight from numpy.
average = numpy.mean


def dev(x):
    """Return the sample standard deviation of x (variance computed with ddof=1)."""
    return math.sqrt(numpy.var(x, ddof=1))
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	23
				24
def calc_norm_stat_props(ts: TimeSeries, bins_count: int = None, confidence: float = 0.95) -> NormStatProps:
    """
    Compute summary statistics for the samples in ts.data.

    Fills a NormStatProps with the mean, sample deviation, min/max, the
    1/5/10/50/90/95/99 percentiles, skewness and kurtosis. A confidence
    interval half-width (Student's t) is stored only when at least
    MIN_VALUES_FOR_CONFIDENCE samples are available, otherwise both confidence
    fields are None. A histogram is stored only when bins_count is given.

    Raises ValueError when ts.data is empty.
    """

    # array.array has very basic support
    samples = cast(List[int], ts.data)
    res = NormStatProps(samples, ts.units)  # type: ignore

    # explicit length check: the container may not define a usable truth value
    if len(samples) == 0:
        raise ValueError("Input array is empty")

    samples = sorted(samples)
    res.average = average(samples)
    res.deviation = dev(samples)

    # data is sorted, so the extremes sit at both ends
    res.min = samples[0]
    res.max = samples[-1]

    (res.perc_1, res.perc_5, res.perc_10, res.perc_50,
     res.perc_90, res.perc_95, res.perc_99) = numpy.percentile(samples, q=[1.0, 5.0, 10., 50., 90., 95., 99.])

    if len(samples) < MIN_VALUES_FOR_CONFIDENCE:
        res.confidence = None
        res.confidence_level = None
    else:
        std_err = stats.sem(samples)
        t_quantile = stats.t.ppf((1 + confidence) / 2, len(samples) - 1)
        res.confidence = std_err * t_quantile
        res.confidence_level = confidence

    if bins_count is not None:
        populations, edges = numpy.histogram(samples, bins=bins_count)
        res.bins_populations = populations
        # keep one edge per bin by dropping the rightmost edge
        res.bins_edges = edges[:-1]

    # the normality test is best-effort: a failure is logged and normtest is left untouched
    try:
        res.normtest = stats.mstats.normaltest(samples)
    except Exception as exc:
        logger.warning("stats.mstats.normaltest failed with error: %s", exc)

    res.skew = stats.skew(samples)
    res.kurt = stats.kurtosis(samples)

    return res
				66
				67
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	68	# update this code
				69	def rebin_histogram(bins_populations: numpy.array,
				70	bins_edges: numpy.array,
				71	new_bins_count: int,
				72	left_tail_idx: int = None,
				73	right_tail_idx: int = None,
				74	log_bins: bool = False) -> Tuple[numpy.array, numpy.array]:
				75	# rebin large histogram into smaller with new_bins bins, linearly distributes across
				76	# left_tail_idx:right_tail_idx range
				77
				78	assert len(bins_populations.shape) == 1
				79	assert len(bins_edges.shape) == 1
				80	assert bins_edges.shape[0] == bins_populations.shape[0]
				81
				82	if left_tail_idx is None:
				83	min_val = bins_edges[0]
				84	else:
				85	min_val = bins_edges[left_tail_idx]
				86
				87	if right_tail_idx is None:
				88	max_val = bins_edges[-1]
				89	else:
				90	max_val = bins_edges[right_tail_idx]
				91
				92	if log_bins:
				93	assert min_val > 1E-3
				94	step = (max_val / min_val) ** (1 / new_bins_count)
				95	new_bins_edges = min_val * (step ** numpy.arange(new_bins_count)) # type: numpy.array
				96	else:
				97	new_bins_edges = numpy.linspace(min_val, max_val, new_bins_count + 1, dtype='float')[:-1] # type: numpy.array
				98
				99	old_bins_pos = numpy.searchsorted(new_bins_edges, bins_edges, side='right')
				100	new_bins = numpy.zeros(new_bins_count, dtype=int) # type: numpy.array
				101
				102	# last source bin can't be split
				103	# TODO: need to add assert for this
				104	new_bins[-1] += bins_populations[-1]
				105	bin_sizes = bins_edges[1:] - bins_edges[:-1]
				106
				107	# correct position to get bin idx from edge idx
				108	old_bins_pos -= 1
				109	old_bins_pos[old_bins_pos < 0] = 0
				110	new_bins_sizes = new_bins_edges[1:] - new_bins_edges[:-1]
				111
				112	for population, begin, end, bsize in zip(bins_populations[:-1], old_bins_pos[:-1], old_bins_pos[1:], bin_sizes):
				113	if begin == end:
				114	new_bins[begin] += population
				115	else:
				116	density = population / bsize
				117	for curr_box in range(begin, end):
				118	cnt = min(int(new_bins_sizes[begin] * density + 0.5), population)
				119	new_bins[begin] += cnt
				120	population -= cnt
				121
				122	return new_bins, new_bins_edges
				123
				124
def calc_histo_stat_props(ts: TimeSeries,
                          bins_edges: numpy.array = None,
                          rebins_count: int = None,
                          tail: float = 0.005) -> HistoStatProps:
    """
    Calculate statistical properties of histogram time series.

    ts.data is summed along axis 0 into a single histogram, from which the
    1/5/10/50/90/95/99 percentiles and min/max are taken as bins_edges values.

    bins_edges - histogram edges, ts.histo_bins is used if not provided
    rebins_count - if set, stored histogram is rebinned into this many bins
        with rebin_histogram, limited to the range left after cutting tails
    tail - share of total population to cut from each side before rebinning

    Raises IndexError if aggregated histogram has no populated bins.
    """
    if bins_edges is None:
        bins_edges = ts.histo_bins

    res = HistoStatProps(ts.data, ts.units)

    # sum across all series
    aggregated = ts.data.sum(axis=0, dtype='int')
    total = aggregated.sum()

    # percentiles levels
    expected = list(numpy.array([0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99]) * total)
    cumsum = numpy.cumsum(aggregated)

    # first bin where cumulative population reaches each level
    percentiles_bins = numpy.searchsorted(cumsum, expected)
    percentiles = bins_edges[percentiles_bins]
    res.perc_1, res.perc_5, res.perc_10, res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles

    # don't show tail ranges on histogram
    left_tail_idx, right_tail_idx = numpy.searchsorted(cumsum, [tail * total, (1 - tail) * total])

    # minimal and maximal non-zero elements
    non_zero = numpy.nonzero(aggregated)[0]
    first_idx = non_zero[0]
    last_idx = non_zero[-1]

    # index edges with the bin position, not with the bin population
    res.min = bins_edges[first_idx]

    # use the edge following last populated bin, if there is one
    res.max = bins_edges[min(last_idx + 1, len(bins_edges) - 1)]

    res.log_bins = False
    if rebins_count is not None:
        res.bins_populations, res.bins_edges = rebin_histogram(aggregated, bins_edges, rebins_count,
                                                               left_tail_idx, right_tail_idx)
    else:
        res.bins_populations = aggregated
        res.bins_edges = bins_edges.copy()

    return res
				163
				164
def groupby_globally(data: Iterable, key_func: Callable):
    """
    Group all items of data by key_func, merging equal keys even when they are not adjacent.

    key_func must return a 4-item key, it is unpacked and stored as a tuple.
    Returns dict: key tuple -> list of items in the order they were met.
    """
    merged = {}  # type: ignore

    for raw_key, run in itertools.groupby(data, key_func):
        # unpacking enforces the 4-item key shape
        bs, cache_tp, act, conc = raw_key
        key = (bs, cache_tp, act, conc)

        if key not in merged:
            merged[key] = []
        merged[key].extend(run)

    return merged
				174
				175
def approximate_curve(x: List[Number], y: List[float], xnew: List[Number], curved_coef: int) -> List[float]:
    """fits Chebyshev series of degree curved_coef to (x, y), returns its values in xnew dots"""
    coefficients = chebfit(x, y, curved_coef)
    ynew = chebval(xnew, coefficients)
    return cast(List[float], ynew)
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	179
				180
def approximate_line(x: List[Number], y: List[float], xnew: List[Number], relative_dist: bool = False) -> List[float]:
    """
    x, y - test data, xnew - dots, where approximation is evaluated
    least-squares fit of a line; minimized distance is
    y - newy, or 1 - y / newy if relative_dist is set
    returns ynew - y values of linear approximation in xnew
    raises ValueError if optimizer reports failure
    """

    # line model, params is (slope, shift)
    def line(params, xs):
        slope, shift = params
        return slope * xs + shift

    def residual_abs(params, xs, ys):
        return ys - line(params, xs)

    def residual_rel(params, xs, ys):
        return 1.0 - ys / line(params, xs)

    # choose distance mode
    residual = residual_rel if relative_dist else residual_abs

    xs = numpy.array(x)
    ys = numpy.array(y)

    # initial guess - line through first two dots
    start = tuple(linalg.solve([[xs[0], 1.0], [xs[1], 1.0]],
                               ys[:2]))

    # find line
    fitted, status = optimize.leastsq(residual, start[:], args=(xs, ys))

    # leastsq reports success with codes 1..4
    if status not in (1, 2, 3, 4):
        raise ValueError("No line for this dots")

    # return new dots
    return line(fitted, numpy.array(xnew))
koder aka kdanilov	6c49106	2015-04-09 22:33:13 +0300	[diff] [blame]	215
				216
def moving_average(data: numpy.array, window: int) -> numpy.array:
    """returns mean of every full window of `window` consecutive items, len(data) - window + 1 values"""
    running = numpy.cumsum(data)

    # first `window` running sums are kept, later ones become sums over the last `window` items
    windowed = numpy.concatenate((running[:window], running[window:] - running[:-window]))

    # items before window - 1 cover less than a full window
    full_windows = windowed[window - 1:]
    return full_windows / window
				221
				222
				223	def moving_dev(data: numpy.array, window: int) -> numpy.array:
				224	cumsum = numpy.cumsum(data)
				225	cumsum2 = numpy.cumsum(data ** 2)
				226	cumsum[window:] = cumsum[window:] - cumsum[:-window]
				227	cumsum2[window:] = cumsum2[window:] - cumsum2[:-window]
				228	return ((cumsum2[window - 1:] - cumsum[window - 1:] 2 / window) / (window - 1)) 0.5
				229
				230
				231	def find_ouliers(data: numpy.array,
				232	center_range: Tuple[int, int] = (25, 75),
kdanylov aka koder	0e0cfcb	2017-03-27 22:19:09 +0300	[diff] [blame]	233	cut_range: float = 3.0) -> numpy.array:
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	234	v1, v2 = numpy.percentile(data, center_range)
				235	return numpy.abs(data - (v1 + v2) / 2) > ((v2 - v1) / 2 * cut_range)
				236
				237
				238	def find_ouliers_ts(data: numpy.array,
				239	windows_size: int = 30,
				240	center_range: Tuple[int, int] = (25, 75),
kdanylov aka koder	0e0cfcb	2017-03-27 22:19:09 +0300	[diff] [blame]	241	cut_range: float = 3.0) -> numpy.array:
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	242	outliers = numpy.empty(data.shape, dtype=bool)
				243
				244	if len(data) < windows_size:
				245	outliers[:] = False
				246	return outliers
				247
				248	begin_idx = 0
				249	if len(data) < windows_size * 2:
				250	end_idx = (len(data) % windows_size) // 2 + windows_size
				251	else:
				252	end_idx = len(data)
				253
				254	while True:
				255	cdata = data[begin_idx: end_idx]
				256	outliers[begin_idx: end_idx] = find_ouliers(cdata, center_range, cut_range)
				257	begin_idx = end_idx
				258
				259	if end_idx == len(data):
				260	break
				261
				262	end_idx += windows_size
				263	if len(data) - end_idx < windows_size:
				264	end_idx = len(data)
				265
				266	return outliers
				267
				268
				269	def hist_outliers_nd(bin_populations: numpy.array,
				270	bin_centers: numpy.array,
				271	center_range: Tuple[int, int] = (25, 75),
				272	cut_range: float = 3.0) -> Tuple[int, int]:
				273	assert len(bin_populations) == len(bin_centers)
				274	total_count = bin_populations.sum()
				275
				276	perc25 = total_count / 100.0 * center_range[0]
				277	perc75 = total_count / 100.0 * center_range[1]
				278
				279	perc25_idx, perc75_idx = numpy.searchsorted(numpy.cumsum(bin_populations), [perc25, perc75])
				280	middle = (bin_centers[perc75_idx] + bin_centers[perc25_idx]) / 2
				281	r = (bin_centers[perc75_idx] - bin_centers[perc25_idx]) / 2
				282
				283	lower_bound = middle - r * cut_range
				284	upper_bound = middle + r * cut_range
				285
				286	lower_cut_idx, upper_cut_idx = numpy.searchsorted(bin_centers, [lower_bound, upper_bound])
				287	return lower_cut_idx, upper_cut_idx
				288
				289
				290	def hist_outliers_perc(bin_populations: numpy.array,
kdanylov aka koder	cdfcdaf	2017-04-29 10:03:39 +0300	[diff] [blame]	291	bounds_perc: Tuple[float, float] = (0.01, 0.99),
				292	min_bins_left: int = None) -> Tuple[int, int]:
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	293	assert len(bin_populations.shape) == 1
				294	total_count = bin_populations.sum()
				295	lower_perc = total_count * bounds_perc[0]
				296	upper_perc = total_count * bounds_perc[1]
kdanylov aka koder	cdfcdaf	2017-04-29 10:03:39 +0300	[diff] [blame]	297	idx1, idx2 = numpy.searchsorted(numpy.cumsum(bin_populations), [lower_perc, upper_perc])
				298
				299	# don't cut too many bins. At least min_bins_left must left
				300	if min_bins_left is not None and idx2 - idx1 < min_bins_left:
				301	missed = min_bins_left - (idx2 - idx1) // 2
				302	idx2 = min(len(bin_populations), idx2 + missed)
				303	idx1 = max(0, idx1 - missed)
				304
				305	return idx1, idx2
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	306
				307
				308	def ts_hist_outliers_perc(bin_populations: numpy.array,
				309	window_size: int = 10,
kdanylov aka koder	cdfcdaf	2017-04-29 10:03:39 +0300	[diff] [blame]	310	bounds_perc: Tuple[float, float] = (0.01, 0.99),
				311	min_bins_left: int = None) -> Tuple[int, int]:
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	312	assert len(bin_populations.shape) == 2
				313
				314	points = list(range(0, len(bin_populations), window_size))
				315	if len(bin_populations) % window_size != 0:
				316	points.append(points[-1] + window_size)
				317
kdanylov aka koder	cdfcdaf	2017-04-29 10:03:39 +0300	[diff] [blame]	318	ranges = [] # type: List[List[int]]
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	319	for begin, end in zip(points[:-1], points[1:]):
				320	window_hist = bin_populations[begin:end].sum(axis=0)
kdanylov aka koder	cdfcdaf	2017-04-29 10:03:39 +0300	[diff] [blame]	321	ranges.append(hist_outliers_perc(window_hist, bounds_perc=bounds_perc, min_bins_left=min_bins_left))
koder aka kdanilov	a732a60	2017-02-01 20:29:56 +0200	[diff] [blame]	322
				323	return min(i[0] for i in ranges), max(i[1] for i in ranges)
				324
				325
koder aka kdanilov	7f59d56	2016-12-26 01:34:23 +0200	[diff] [blame]	326	# TODO: revise next
				327	# def difference(y, ynew):
				328	# """returns average and maximum relative and
				329	# absolute differences between y and ynew
				330	# result may contain None values for y = 0
				331	# return value - tuple:
				332	# [(abs dif, rel dif) * len(y)],
				333	# (abs average, abs max),
				334	# (rel average, rel max)"""
				335	#
				336	# abs_dlist = []
				337	# rel_dlist = []
				338	#
				339	# for y1, y2 in zip(y, ynew):
				340	# # absolute
				341	# abs_dlist.append(y1 - y2)
				342	#
				343	# if y1 > 1E-6:
				344	# rel_dlist.append(abs(abs_dlist[-1] / y1))
				345	# else:
				346	# raise ZeroDivisionError("{0!r} is too small".format(y1))
				347	#
				348	# da_avg = sum(abs_dlist) / len(abs_dlist)
				349	# dr_avg = sum(rel_dlist) / len(rel_dlist)
				350	#
				351	# return (zip(abs_dlist, rel_dlist),
				352	# (da_avg, max(abs_dlist)), (dr_avg, max(rel_dlist))
				353	# )