working on reporting, this commit represents a broken code state
diff --git a/wally/statistic.py b/wally/statistic.py
index 259ac69..b80fb22 100644
--- a/wally/statistic.py
+++ b/wally/statistic.py
@@ -1,5 +1,4 @@
import math
-import array
import logging
import itertools
import statistics
@@ -17,17 +16,19 @@
logger = logging.getLogger("wally")
DOUBLE_DELTA = 1e-8
+MIN_VALUES_FOR_CONFIDENCE = 7
-average = statistics.mean
-dev = lambda x: math.sqrt(statistics.variance(x))
+average = numpy.mean
+dev = lambda x: math.sqrt(numpy.var(x, ddof=1))
-def calc_norm_stat_props(ts: TimeSeries, confidence: float = 0.95) -> NormStatProps:
+def calc_norm_stat_props(ts: TimeSeries, bins_count: int, confidence: float = 0.95) -> NormStatProps:
"Calculate statistical properties of array of numbers"
- data = ts.data
- res = NormStatProps(data)
+ # array.array has very basic support
+ data = cast(List[int], ts.data)
+ res = NormStatProps(data) # type: ignore
if len(data) == 0:
raise ValueError("Input array is empty")
@@ -39,57 +40,104 @@
res.max = data[-1]
res.min = data[0]
- res.perc_50 = numpy.percentile(data, 50)
- res.perc_90 = numpy.percentile(data, 90)
- res.perc_95 = numpy.percentile(data, 95)
- res.perc_99 = numpy.percentile(data, 99)
+ res.perc_50, res.perc_90, res.perc_95, res.perc_99 = numpy.percentile(data, q=[50., 90., 95., 99.])
- if len(data) >= 3:
+ if len(data) >= MIN_VALUES_FOR_CONFIDENCE:
res.confidence = stats.sem(data) * \
stats.t.ppf((1 + confidence) / 2, len(data) - 1)
+ res.confidence_level = confidence
else:
res.confidence = None
+ res.confidence_level = None
- res.bin_populations, res.bin_edges = numpy.histogram(data, 'auto')
+ res.bins_populations, bins_edges = numpy.histogram(data, bins=bins_count)
+ res.bins_mids = (bins_edges[:-1] + bins_edges[1:]) / 2
try:
res.normtest = stats.mstats.normaltest(data)
except Exception as exc:
logger.warning("stats.mstats.normaltest failed with error: %s", exc)
+ res.skew = stats.skew(data)
+ res.kurt = stats.kurtosis(data)
+
return res
-def calc_histo_stat_props(ts: TimeSeries) -> HistoStatProps:
- data = numpy.array(ts.data)
- data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size]
+def calc_histo_stat_props(ts: TimeSeries,
+ bins_edges: numpy.array,
+ bins_count: int,
+ min_valuable: float = 0.0001) -> HistoStatProps:
+ data = numpy.array(ts.data, dtype='int')
+ data.shape = [len(ts.data) // ts.second_axis_size, ts.second_axis_size] # type: ignore
res = HistoStatProps(ts.data, ts.second_axis_size)
- aggregated = numpy.sum(data, axis=0)
- full_sum = numpy.sum(aggregated)
- expected = [full_sum * 0.5, full_sum * 0.9, full_sum * 0.95, full_sum * 0.99]
+ # summ across all series
+ aggregated = numpy.sum(data, axis=0, dtype='int')
+ total = numpy.sum(aggregated)
+
+ # minimal value used for histo
+ min_val_on_histo = total * min_valuable
+
+ # percentiles levels
+ expected = [total * 0.5, total * 0.9, total * 0.95, total * 0.99]
percentiles = []
- val_min = None
- val_max = None
+ # all indexes, where values greater than min_val_on_histo
+ valuable_idxs = []
- for idx, val in enumerate(aggregated):
- while expected and full_sum + val >= expected[0]:
- percentiles.append(idx)
+ curr_summ = 0
+ non_zero = aggregated.nonzero()[0]
+
+ # calculate percentiles and valuable_indexes
+ for idx in non_zero:
+ val = aggregated[idx]
+ while expected and curr_summ + val >= expected[0]:
+ percentiles.append(bins_edges[idx])
del expected[0]
- full_sum += val
+ curr_summ += val
- if val != 0:
- if val_min is None:
- val_min = idx
- val_max = idx
+ if val >= min_val_on_histo:
+ valuable_idxs.append(idx)
- res.perc_50, res.perc_90, res.perc_95, res.perc_99 = map(ts.bins_edges.__getitem__, percentiles)
- res.min = ts.bins_edges[val_min]
- res.max = ts.bins_edges[val_max]
- res.bin_populations = aggregated
+ res.perc_50, res.perc_90, res.perc_95, res.perc_99 = percentiles
+
+ # minimal and maximal non-zero elements
+ res.min = bins_edges[non_zero[0]]
+ res.max = bins_edges[non_zero[-1] + (1 if non_zero[-1] != len(bins_edges) else 0)]
+
+ # minimal and maximal valuable elements
+ val_idx_min = valuable_idxs[0]
+ val_idx_max = valuable_idxs[-1]
+
+ raw_bins_populations = aggregated[val_idx_min: val_idx_max + 1]
+ raw_bins_edges = bins_edges[val_idx_min: val_idx_max + 2]
+ raw_bins_mids = cast(numpy.array, (raw_bins_edges[1:] + raw_bins_edges[:-1]) / 2)
+
+ step = (raw_bins_mids[-1] - raw_bins_mids[0]) / bins_count
+ next = raw_bins_mids[0]
+
+ # aggregate raw histogram with many bins into result histogram with bins_count bins
+ cidx = 0
+ bins_populations = []
+ bins_mids = []
+
+ while cidx < len(raw_bins_mids):
+ next += step
+ bin_population = 0
+
+ while cidx < len(raw_bins_mids) and raw_bins_mids[cidx] <= next:
+ bin_population += raw_bins_populations[cidx]
+ cidx += 1
+
+ bins_populations.append(bin_population)
+ bins_mids.append(next - step / 2)
+
+ res.bins_populations = numpy.array(bins_populations, dtype='int')
+ res.bins_mids = numpy.array(bins_mids, dtype='float32')
+
return res