Collect per-run fio timings, rework console report, add rrd/verify configs
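
* wally/suits/io/agent.py: record wall-clock start/end times around each
  fio invocation, return them alongside the parsed results, and store the
  per-slice timings (paired with section names) in
  res['__meta__']['timings'].
* wally/suits/io/formatter.py: take precomputed per-test statistics
  (dinfo), add "Name" and 95%-confidence columns, and switch from
  ssize_to_b/med_dev to ssize2b and rounded_average_conf().
* wally/suits/io/results_loader.py: drop the unused PerfInfo /
  process_disk_info helpers and the stale commented-out loaders.
* wally/suits/io/ceph.cfg: raise ramp_time 5 -> 15 and runtime 30 -> 60.
* Add rrd.cfg (long ceph-style runs) and verify.cfg (verification run).

Call sites must be updated for the new shapes. A minimal sketch, using
only names visible in this diff (callers outside this patch are not
shown here):

    # run_fio()/run_benchmark() now also return per-slice timings
    job_res, num_tests, timings = run_benchmark(argv_obj.type,
                                                sliced_it, rrfunc)

    # format_results_for_console() now needs the per-test stats (dinfo)
    report_text = format_results_for_console(test_set, dinfo)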
diff --git a/wally/suits/io/agent.py b/wally/suits/io/agent.py
index 51eb2fd..f6c3308 100644
--- a/wally/suits/io/agent.py
+++ b/wally/suits/io/agent.py
@@ -373,8 +373,10 @@
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
 
+    start_time = time.time()  # wall-clock bounds around the whole fio run
     # set timeout
     raw_out, raw_err = p.communicate(benchmark_config)
+    end_time = time.time()
 
     # HACK
     raw_out = "{" + raw_out.split('{', 1)[1]
@@ -395,7 +397,7 @@
         raw_out = raw_out[:100]
         raise ValueError(msg.format(raw_out, exc))
 
-    return zip(parsed_out, config_slice)
+    return zip(parsed_out, config_slice), (start_time, end_time)
 
 
 def add_job_results(section, job_output, res):
@@ -445,13 +447,16 @@
     curr_test_num = 0
     executed_tests = 0
     result = {}
+    timings = []
 
     for i, test_slice in enumerate(sliced_list):
-        res_cfg_it = do_run_fio(test_slice)
+        res_cfg_it, slice_timings = do_run_fio(test_slice)
         res_cfg_it = enumerate(res_cfg_it, curr_test_num)
 
+        section_names = []
         for curr_test_num, (job_output, section) in res_cfg_it:
             executed_tests += 1
+            section_names.append(section.name)
 
             if raw_results_func is not None:
                 raw_results_func(executed_tests,
@@ -465,6 +470,7 @@
 
             add_job_results(section, job_output, result)
 
+        timings.append((section_names, slice_timings))
         curr_test_num += 1
         msg_template = "Done {0} tests from {1}. ETA: {2}"
 
@@ -475,7 +481,7 @@
                                   test_left,
                                   sec_to_str(time_eta))
 
-    return result, executed_tests
+    return result, executed_tests, timings
 
 
 def run_benchmark(binary_tp, *argv, **kwargs):
@@ -605,11 +611,13 @@
         rrfunc = raw_res_func if argv_obj.show_raw_results else None
 
         stime = time.time()
-        job_res, num_tests = run_benchmark(argv_obj.type,
-                                           sliced_it, rrfunc)
+        job_res, num_tests, timings = run_benchmark(argv_obj.type,
+                                                    sliced_it, rrfunc)
         etime = time.time()
 
-        res = {'__meta__': {'raw_cfg': job_cfg, 'params': params},
+        res = {'__meta__': {'raw_cfg': job_cfg,
+                            'params': params,
+                            'timings': timings},
                'res': job_res}
 
         oformat = 'json' if argv_obj.json else 'eval'
diff --git a/wally/suits/io/ceph.cfg b/wally/suits/io/ceph.cfg
index 425696a..5593181 100644
--- a/wally/suits/io/ceph.cfg
+++ b/wally/suits/io/ceph.cfg
@@ -12,8 +12,8 @@
 NUMJOBS_SHORT={% 1, 2, 3, 10 %}
 
 size=30G
-ramp_time=5
-runtime=30
+ramp_time=15
+runtime=60
 
 # ---------------------------------------------------------------------
 # check different thread count, sync mode. (latency, iops) = func(th_count)
diff --git a/wally/suits/io/formatter.py b/wally/suits/io/formatter.py
index 7fbe70b..09565be 100644
--- a/wally/suits/io/formatter.py
+++ b/wally/suits/io/formatter.py
@@ -1,33 +1,34 @@
 import texttable
 
-from wally.utils import ssize_to_b
+from wally.utils import ssize2b
+from wally.statistic import round_3_digit
 from wally.suits.io.agent import get_test_summary
-from wally.statistic import med_dev, round_deviation, round_3_digit
 
 
 def key_func(k_data):
-    _, data = k_data
-
+    name, data = k_data
     return (data['rw'],
             data['sync_mode'],
-            ssize_to_b(data['blocksize']),
-            data['concurence'])
+            ssize2b(data['blocksize']),
+            data['concurence'],
+            name)
 
 
-def format_results_for_console(test_set):
+def format_results_for_console(test_set, dinfo):
     """
     create a table with io performance report
     for console
     """
     tab = texttable.Texttable(max_width=120)
     tab.set_deco(tab.HEADER | tab.VLINES | tab.BORDER)
-    tab.set_cols_align(["l", "r", "r", "r", "r", "r", "r"])
+    tab.set_cols_align(["l", "l", "r", "r", "r", "r", "r", "r"])
+
+    items = sorted(test_set['res'].items(), key=key_func)
 
     prev_k = None
     vm_count = test_set['__test_meta__']['testnodes_count']
-    items = sorted(test_set['res'].items(), key=key_func)
-    header = ["Description", "iops\ncum", "KiBps\ncum",
-              "iops\nper vm", "KiBps\nper vm", "Cnf\n%", "lat\nms"]
+    header = ["Name", "Description", "iops\ncum", "KiBps\ncum",
+              "Cnf\n95%", "iops\nper vm", "KiBps\nper vm", "lat\nms"]
 
     for test_name, data in items:
 
@@ -36,34 +37,30 @@
         if prev_k is not None:
             if prev_k != curr_k:
                 tab.add_row(
-                    ["--------", "-----", "------",
-                     "-----", "------", "---", "-----"])
+                    ["-------", "--------", "-----", "------",
+                     "---", "------", "---", "-----"])
 
         prev_k = curr_k
 
         descr = get_test_summary(data)
+        test_dinfo = dinfo[test_name]
 
-        iops, _ = round_deviation(med_dev(data['iops']))
-        bw, bwdev = round_deviation(med_dev(data['bw']))
+        iops, _ = test_dinfo.iops.rounded_average_conf()
+        bw, bw_conf = test_dinfo.bw.rounded_average_conf()
+        conf_perc = int(round(bw_conf * 100 / bw)) if bw else 0  # guard bw == 0
 
-        # 3 * sigma
-        if 0 == bw:
-            assert 0 == bwdev
-            dev_perc = 0
-        else:
-            dev_perc = int((bwdev * 300) / bw)
+        lat, _ = test_dinfo.lat.rounded_average_conf()
+        lat = round_3_digit(int(lat) // 1000)  # fio reports usec; show ms
 
-        med_lat, _ = round_deviation(med_dev(data['lat']))
-        med_lat = int(med_lat) // 1000
+        iops_per_vm = round_3_digit(iops / float(vm_count))
+        bw_per_vm = round_3_digit(bw / float(vm_count))
 
         iops = round_3_digit(iops)
         bw = round_3_digit(bw)
-        iops_cum = round_3_digit(iops * vm_count)
-        bw_cum = round_3_digit(bw * vm_count)
-        med_lat = round_3_digit(med_lat)
 
-        params = (descr, int(iops_cum), int(bw_cum),
-                  int(iops), int(bw), dev_perc, med_lat)
+        params = (test_name.split('_', 1)[0],
+                  descr, int(iops), int(bw), str(conf_perc),
+                  int(iops_per_vm), int(bw_per_vm), lat)
         tab.add_row(params)
 
     tab.header(header)
diff --git a/wally/suits/io/results_loader.py b/wally/suits/io/results_loader.py
index 4dff186..988fe0e 100644
--- a/wally/suits/io/results_loader.py
+++ b/wally/suits/io/results_loader.py
@@ -1,47 +1,5 @@
 import re
 import json
-import collections
-
-
-# from wally.utils import ssize_to_b
-from wally.statistic import med_dev
-
-PerfInfo = collections.namedtuple('PerfInfo',
-                                  ('name',
-                                   'bw', 'iops', 'dev',
-                                   'lat', 'lat_dev', 'raw',
-                                   'meta'))
-
-
-def split_and_add(data, block_size):
-    assert len(data) % block_size == 0
-    res = [0] * block_size
-
-    for idx, val in enumerate(data):
-        res[idx % block_size] += val
-
-    return res
-
-
-def process_disk_info(test_output):
-    data = {}
-    for tp, pre_result in test_output:
-        if tp != 'io' or pre_result is None:
-            pass
-
-        vm_count = pre_result['__test_meta__']['testnodes_count']
-        for name, results in pre_result['res'].items():
-            assert len(results['bw']) % vm_count == 0
-            block_count = len(results['bw']) // vm_count
-
-            bw, bw_dev = med_dev(split_and_add(results['bw'], block_count))
-            iops, _ = med_dev(split_and_add(results['iops'],
-                                            block_count))
-            lat, lat_dev = med_dev(results['lat'])
-            dev = bw_dev / float(bw)
-            data[name] = PerfInfo(name, bw, iops, dev, lat, lat_dev, results,
-                                  pre_result['__test_meta__'])
-    return data
 
 
 def parse_output(out_err):
@@ -96,21 +54,3 @@
             else:
                 yield map(result.raw.get, fields_to_select)
     return closure
-
-
-# def load_data(raw_data):
-#     data = list(parse_output(raw_data))[0]
-
-#     for key, val in data['res'].items():
-#         val['blocksize_b'] = ssize_to_b(val['blocksize'])
-
-#         val['iops_mediana'], val['iops_stddev'] = med_dev(val['iops'])
-#         val['bw_mediana'], val['bw_stddev'] = med_dev(val['bw'])
-#         val['lat_mediana'], val['lat_stddev'] = med_dev(val['lat'])
-#         yield val
-
-
-# def load_files(*fnames):
-#     for fname in fnames:
-#         for i in load_data(open(fname).read()):
-#             yield i
diff --git a/wally/suits/io/rrd.cfg b/wally/suits/io/rrd.cfg
new file mode 100644
index 0000000..5593181
--- /dev/null
+++ b/wally/suits/io/rrd.cfg
@@ -0,0 +1,55 @@
+[defaults]
+wait_for_previous
+group_reporting
+time_based
+buffered=0
+iodepth=1
+softrandommap=1
+filename={FILENAME}
+NUM_ROUNDS=7
+
+NUMJOBS={% 1, 5, 10, 15, 40 %}
+NUMJOBS_SHORT={% 1, 2, 3, 10 %}
+
+size=30G
+ramp_time=15
+runtime=60
+
+# ---------------------------------------------------------------------
+# check different thread count, sync mode. (latency, iops) = func(th_count)
+# ---------------------------------------------------------------------
+[ceph_test_{TEST_SUMM} * {NUM_ROUNDS}]
+blocksize=4k
+rw=randwrite
+sync=1
+numjobs={NUMJOBS}
+
+# ---------------------------------------------------------------------
+# direct write
+# ---------------------------------------------------------------------
+[ceph_test_{TEST_SUMM} * {NUM_ROUNDS}]
+blocksize=4k
+rw=randwrite
+direct=1
+numjobs=1
+
+# ---------------------------------------------------------------------
+# check different thread count, direct read mode. (latency, iops) = func(th_count)
+# also check iops for randread
+# ---------------------------------------------------------------------
+[ceph_test_{TEST_SUMM} * {NUM_ROUNDS}]
+blocksize=4k
+rw=randread
+direct=1
+numjobs={NUMJOBS}
+
+# ---------------------------------------------------------------------
+# these are essentially sequential write/read operations
+# we can't use sequential with numjobs > 1 due to caching and block merging
+# ---------------------------------------------------------------------
+[ceph_test_{TEST_SUMM} * {NUM_ROUNDS}]
+blocksize=16m
+rw={% randread, randwrite %}
+direct=1
+numjobs={NUMJOBS_SHORT}
+
diff --git a/wally/suits/io/verify.cfg b/wally/suits/io/verify.cfg
new file mode 100644
index 0000000..4a66aac
--- /dev/null
+++ b/wally/suits/io/verify.cfg
@@ -0,0 +1,38 @@
+[defaults]
+wait_for_previous
+group_reporting
+time_based
+buffered=0
+iodepth=1
+softrandommap=1
+filename={FILENAME}
+NUM_ROUNDS=1
+
+size=5G
+ramp_time=5
+runtime=360
+
+# ---------------------------------------------------------------------
+# large-block direct random read with a fixed thread count
+# ---------------------------------------------------------------------
+[verify_{TEST_SUMM}]
+blocksize=4m
+rw=randread
+direct=1
+numjobs=5
+
+# ---------------------------------------------------------------------
+# 4k direct random write (disabled)
+# ---------------------------------------------------------------------
+# [verify_{TEST_SUMM}]
+# blocksize=4k
+# rw=randwrite
+# direct=1
+
+# ---------------------------------------------------------------------
+# 4k direct random read (disabled)
+# ---------------------------------------------------------------------
+# [verify_{TEST_SUMM}]
+# blocksize=4k
+# rw=randread
+# direct=1