fixes, fixes, fixes
diff --git a/wally/suits/io/agent.py b/wally/suits/io/agent.py
index 5fb835e..336b176 100644
--- a/wally/suits/io/agent.py
+++ b/wally/suits/io/agent.py
@@ -428,52 +428,41 @@
def run_fio(sliced_it, raw_results_func=None):
sliced_list = list(sliced_it)
- ok = True
- try:
- curr_test_num = 0
- executed_tests = 0
- result = {}
+ curr_test_num = 0
+ executed_tests = 0
+ result = {}
- for i, test_slice in enumerate(sliced_list):
- res_cfg_it = do_run_fio(test_slice)
- res_cfg_it = enumerate(res_cfg_it, curr_test_num)
+ for i, test_slice in enumerate(sliced_list):
+ res_cfg_it = do_run_fio(test_slice)
+ res_cfg_it = enumerate(res_cfg_it, curr_test_num)
- for curr_test_num, (job_output, section) in res_cfg_it:
- executed_tests += 1
+ for curr_test_num, (job_output, section) in res_cfg_it:
+ executed_tests += 1
- if raw_results_func is not None:
- raw_results_func(executed_tests,
- [job_output, section])
+ if raw_results_func is not None:
+ raw_results_func(executed_tests,
+ [job_output, section])
- msg = "{0} != {1}".format(section.name, job_output["jobname"])
- assert section.name == job_output["jobname"], msg
+ msg = "{0} != {1}".format(section.name, job_output["jobname"])
+ assert section.name == job_output["jobname"], msg
- if section.name.startswith('_'):
- continue
+ if section.name.startswith('_'):
+ continue
- add_job_results(section, job_output, result)
+ add_job_results(section, job_output, result)
- curr_test_num += 1
- msg_template = "Done {0} tests from {1}. ETA: {2}"
+ curr_test_num += 1
+ msg_template = "Done {0} tests from {1}. ETA: {2}"
- rest = sliced_list[i:]
- time_eta = sum(map(calculate_execution_time, rest))
- test_left = sum(map(len, rest))
- print msg_template.format(curr_test_num,
- test_left,
- sec_to_str(time_eta))
+ rest = sliced_list[i:]
+ time_eta = sum(map(calculate_execution_time, rest))
+ test_left = sum(map(len, rest))
+ print msg_template.format(curr_test_num,
+ test_left,
+ sec_to_str(time_eta))
- except (SystemExit, KeyboardInterrupt):
- raise
-
- except Exception:
- print "=========== ERROR ============="
- traceback.print_exc()
- print "======== END OF ERROR ========="
- ok = False
-
- return result, executed_tests, ok
+ return result, executed_tests
def run_benchmark(binary_tp, *argv, **kwargs):
@@ -603,8 +592,8 @@
rrfunc = raw_res_func if argv_obj.show_raw_results else None
stime = time.time()
- job_res, num_tests, ok = run_benchmark(argv_obj.type,
- sliced_it, rrfunc)
+ job_res, num_tests = run_benchmark(argv_obj.type,
+ sliced_it, rrfunc)
etime = time.time()
res = {'__meta__': {'raw_cfg': job_cfg, 'params': params},
@@ -622,8 +611,21 @@
out_fd.write(pprint.pformat(res) + "\n")
out_fd.write("\n========= END OF RESULTS =========\n")
- return 0 if ok else 1
+ return 0
+ except:
+ out_fd.write("============ ERROR =============\n")
+ out_fd.write(traceback.format_exc() + "\n")
+ out_fd.write("============ END OF ERROR =============\n")
+ return 1
finally:
+ try:
+ if out_fd is not sys.stdout:
+ out_fd.flush()
+ os.fsync(out_fd)
+ out_fd.close()
+ except Exception:
+ traceback.print_exc()
+
if argv_obj.pid_file is not None:
if os.path.exists(argv_obj.pid_file):
os.unlink(argv_obj.pid_file)
diff --git a/wally/suits/io/results_loader.py b/wally/suits/io/results_loader.py
index 9005450..3c8d9c5 100644
--- a/wally/suits/io/results_loader.py
+++ b/wally/suits/io/results_loader.py
@@ -29,6 +29,14 @@
def parse_output(out_err):
+ err_start_patt = r"(?ims)=+\s+ERROR\s+=+"
+ err_end_patt = r"(?ims)=+\s+END OF ERROR\s+=+"
+
+ for block in re.split(err_start_patt, out_err)[1:]:
+ tb, garbage = re.split(err_end_patt, block)
+ msg = "Test fails with error:\n" + tb.strip() + "\n"
+ raise OSError(msg)
+
start_patt = r"(?ims)=+\s+RESULTS\(format=json\)\s+=+"
end_patt = r"(?ims)=+\s+END OF RESULTS\s+=+"
diff --git a/wally/suits/itest.py b/wally/suits/itest.py
index 294909b..ad5e8a5 100644
--- a/wally/suits/itest.py
+++ b/wally/suits/itest.py
@@ -120,6 +120,8 @@
class IOPerfTest(IPerfTest):
+ tcp_conn_timeout = 30
+ max_pig_timeout = 30
def __init__(self, *dt, **mp):
IPerfTest.__init__(self, *dt, **mp)
@@ -236,6 +238,67 @@
elif self.is_primary:
logger.warning("Prefilling of test files is disabled")
+ def get_test_status(self):
+ is_connected = None
+ has_pid_file = None
+ pid = None
+ err = None
+
+ try:
+ conn = connect(self.node.conn_url,
+ conn_timeout=self.tcp_conn_timeout)
+ with conn:
+ with conn.open_sftp() as sftp:
+ try:
+ pid = read_from_remote(sftp, self.pid_file)
+ has_pid_file = True
+ except (NameError, IOError) as exc:
+ pid = None
+ has_pid_file = False
+
+ is_connected = True
+
+ except (socket.error, SSHException, EOFError) as exc:
+ err = str(exc)
+ is_connected = False
+
+ return is_connected, has_pid_file, pid, err
+
+ def wait_till_finished(self, timeout):
+ conn_id = self.node.get_conn_id()
+ end_of_wait_time = timeout + time.time()
+
+ # time_till_check = random.randint(30, 90)
+ time_till_check = 5
+ pid = None
+ has_pid_file = False
+ pid_get_timeout = self.max_pig_timeout + time.time()
+ curr_connected = True
+
+ while end_of_wait_time > time.time():
+ time.sleep(time_till_check)
+
+ is_connected, has_pid_file, pid, err = self.get_test_status()
+
+ if not has_pid_file:
+ if pid is None and time.time() > pid_get_timeout:
+ msg = ("On node {0} pid file doesn't " +
+ "appears in time")
+ logger.error(msg.format(conn_id))
+ raise RuntimeError("Start timeout")
+ else:
+ # execution finished
+ break
+
+ if is_connected and not curr_connected:
+ msg = "Connection with {0} is restored"
+ logger.debug(msg.format(conn_id))
+ elif not is_connected and curr_connected:
+ msg = "Lost connection with " + conn_id + ". Error: " + err
+ logger.debug(msg)
+
+ curr_connected = is_connected
+
def run(self, barrier):
conn_id = self.node.get_conn_id()
@@ -286,80 +349,26 @@
wait_till.strftime("%H:%M:%S")))
self.run_over_ssh(cmd)
+
msg = "Test on node {0} started in screen {1}"
logger.debug(msg.format(conn_id, screen_name))
- end_of_wait_time = timeout + time.time()
-
- # time_till_check = random.randint(30, 90)
- time_till_check = 5
-
- pid = None
- no_pid_file = True
- tcp_conn_timeout = 30
- pid_get_timeout = 30 + time.time()
- connection_ok = True
-
# TODO: add monitoring socket
if self.node.connection is not Local:
self.node.connection.close()
- while end_of_wait_time > time.time():
- conn = None
- time.sleep(time_till_check)
-
- try:
- if self.node.connection is not Local:
- conn = connect(self.node.conn_url,
- conn_timeout=tcp_conn_timeout)
- else:
- conn = self.node.connection
-
- try:
- with conn.open_sftp() as sftp:
- try:
- pid = read_from_remote(sftp, self.pid_file)
- no_pid_file = False
- except (NameError, IOError):
- no_pid_file = True
- finally:
- if conn is not Local:
- conn.close()
- conn = None
-
- if no_pid_file:
- if pid is None:
- if time.time() > pid_get_timeout:
- msg = ("On node {0} pid file doesn't " +
- "appears in time")
- logger.error(msg.format(conn_id))
- raise RuntimeError("Start timeout")
- else:
- # execution finished
- break
- if not connection_ok:
- msg = "Connection with {0} is restored"
- logger.debug(msg.format(conn_id))
- connection_ok = True
-
- except (socket.error, SSHException, EOFError) as exc:
- if connection_ok:
- connection_ok = False
- msg = "Lost connection with " + conn_id
- msg += ". Error: " + str(exc)
- logger.debug(msg)
-
+ self.wait_till_finished(timeout)
logger.debug("Done")
if self.node.connection is not Local:
- timeout = tcp_conn_timeout * 3
+ conn_timeout = self.tcp_conn_timeout * 3
self.node.connection = connect(self.node.conn_url,
- conn_timeout=timeout)
+ conn_timeout=conn_timeout)
with self.node.connection.open_sftp() as sftp:
# try to reboot and then connect
- out_err = read_from_remote(sftp,
- self.log_fl)
+ out_err = read_from_remote(sftp, self.log_fl)
+
finally:
barrier.exit()
@@ -369,6 +378,8 @@
try:
for data in parse_output(out_err):
self.on_result_cb(data)
+ except OSError:
+ raise
except Exception as exc:
msg_templ = "Error during postprocessing results: {0!s}"
raise RuntimeError(msg_templ.format(exc))