working on reporting, this commit represents a broken code state
diff --git a/wally/statistic.py b/wally/statistic.py
index 259ac69..b80fb22 100644
--- a/wally/statistic.py
+++ b/wally/statistic.py
@@ -1,5 +1,4 @@
import math
-import array
import logging
import itertools
import statistics
@@ -17,17 +16,19 @@
logger = logging.getLogger("wally")
DOUBLE_DELTA = 1e-8
+MIN_VALUES_FOR_CONFIDENCE = 7
-average = statistics.mean
-dev = lambda x: math.sqrt(statistics.variance(x))
+average = numpy.mean
+dev = lambda x: math.sqrt(numpy.var(x, ddof=1))
-def calc_norm_stat_props(ts: TimeSeries, confidence: float = 0.95) -> NormStatProps:
+def calc_norm_stat_props(ts: TimeSeries, bins_count: int, confidence: float = 0.95) -> NormStatProps:
"Calculate statistical properties of array of numbers"
- data = ts.data
- res = NormStatProps(data)
+ # array.array has very basic support
+ data = cast(List[int], ts.data)
+ res = NormStatProps(data) # type: ignore
if len(data) == 0:
raise ValueError("Input array is empty")
@@ -39,57 +40,104 @@
res.max = data[-1]
res.min = data[0]
- res.perc_50 = numpy.percentile(data, 50)
- res.perc_90 = numpy.percentile(data, 90)
- res.perc_95 = numpy.percentile(data, 95)
- res.perc_99 = numpy.percentile(data, 99)
+ res.perc_50, res.perc_90, res.perc_95, res.perc_99 = numpy.percentile(data, q=[50., 90., 95., 99.])
- if len(data) >= 3:
+ if len(data) >= MIN_VALUES_FOR_CONFIDENCE:
res.confidence = stats.sem(data) * \
stats.t.ppf((1 + confidence) / 2, len(data) - 1)
+ res.confidence_level = confidence
else:
res.confidence = None
+ res.confidence_level = None
- res.bin_populations, res.bin_edges = numpy.histogram(data, 'auto')
+ res.bins_populations, bins_edges = numpy.histogram(data, bins=bins_count)
+ res.bins_mids = (bins_edges[:-1] + bins_edges[1:]) / 2
try:
res.normtest = stats.mstats.normaltest(data)
except Exception as exc:
logger.warning("stats.mstats.normaltest failed with error: %s", exc)
+ res.skew = stats.skew(data)
+ res.kurt = stats.kurtosis(data)
+
return res
-def calc_histo_stat_props(ts: TimeSeries) -> HistoStatProps:
- data = numpy.array(ts.data)
- data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size]
+def calc_histo_stat_props(ts: TimeSeries,
+ bins_edges: numpy.array,
+ bins_count: int,
+ min_valuable: float = 0.0001) -> HistoStatProps:
+ data = numpy.array(ts.data, dtype='int')
+ data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size] # type: ignore
res = HistoStatProps(ts.data, ts.second_axis_size)
- aggregated = numpy.sum(data, axis=0)
- full_sum = numpy.sum(aggregated)
- expected = [full_sum * 0.5, full_sum * 0.9, full_sum * 0.95, full_sum * 0.99]
+ # summ across all series
+ aggregated = numpy.sum(data, axis=0, dtype='int')
+ total = numpy.sum(aggregated)
+
+ # minimal value used for histo
+ min_val_on_histo = total * min_valuable
+
+ # percentiles levels
+ expected = [total * 0.5, total * 0.9, total * 0.95, total * 0.99]
percentiles = []
- val_min = None
- val_max = None
+ # all indexes, where values greater than min_val_on_histo
+ valuable_idxs = []
- for idx, val in enumerate(aggregated):
- while expected and full_sum + val >= expected[0]:
- percentiles.append(idx)
+ curr_summ = 0
+ non_zero = aggregated.nonzero()[0]
+
+ # calculate percentiles and valuable_indexes
+ for idx in non_zero:
+ val = aggregated[idx]
+ while expected and curr_summ + val >= expected[0]:
+ percentiles.append(bins_edges[idx])
del expected[0]
- full_sum += val
+ curr_summ += val
- if val != 0:
- if val_min is None:
- val_min = idx
- val_max = idx
+ if val >= min_val_on_histo:
+ valuable_idxs.append(idx)
- res.perc_50, res.perc_90, res.perc_95, res.perc_99 = map(ts.bins_edges.__getitem__, percentiles)
- res.min = ts.bins_edges[val_min]
- res.max = ts.bins_edges[val_max]
- res.bin_populations = aggregated
+ res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles
+
+ # minimal and maximal non-zero elements
+ res.min = bins_edges[non_zero[0]]
+ res.max = bins_edges[non_zero[-1] + (1 if non_zero[-1] != len(bins_edges) else 0)]
+
+ # minimal and maximal valuable elements
+ val_idx_min = valuable_idxs[0]
+ val_idx_max = valuable_idxs[-1]
+
+ raw_bins_populations = aggregated[val_idx_min: val_idx_max + 1]
+ raw_bins_edges = bins_edges[val_idx_min: val_idx_max + 2]
+ raw_bins_mids = cast(numpy.array, (raw_bins_edges[1:] + raw_bins_edges[:-1]) / 2)
+
+ step = (raw_bins_mids[-1] - raw_bins_mids[0]) / bins_count
+ next = raw_bins_mids[0]
+
+ # aggregate raw histogram with many bins into result histogram with bins_count bins
+ cidx = 0
+ bins_populations = []
+ bins_mids = []
+
+ while cidx < len(raw_bins_mids):
+ next += step
+ bin_population = 0
+
+ while cidx < len(raw_bins_mids) and raw_bins_mids[cidx] <= next:
+ bin_population += raw_bins_populations[cidx]
+ cidx += 1
+
+ bins_populations.append(bin_population)
+ bins_mids.append(next - step / 2)
+
+ res.bins_populations = numpy.array(bins_populations, dtype='int')
+ res.bins_mids = numpy.array(bins_mids, dtype='float32')
+
return res