triangle strange bug
diff --git a/wally/ssh_utils.py b/wally/ssh_utils.py
index fa33d46..0f46b83 100644
--- a/wally/ssh_utils.py
+++ b/wally/ssh_utils.py
@@ -18,7 +18,7 @@
class Local(object):
- "placeholder for local node"
+ "simulate ssh connection to local"
@classmethod
def open_sftp(cls):
return cls()
@@ -382,6 +382,8 @@
class BGSSHTask(object):
+ CHECK_RETRY = 5
+
def __init__(self, node, use_sudo):
self.node = node
self.pid = None
@@ -395,15 +397,22 @@
**params)
processes = run_over_ssh(self.node.connection, "ps aux", nolog=True)
- for proc in processes.split("\n"):
- if orig_cmd in proc and "SCREEN" not in proc:
- self.pid = proc.split()[1]
+ for iter in range(self.CHECK_RETRY):
+ for proc in processes.split("\n"):
+ if orig_cmd in proc and "SCREEN" not in proc:
+ self.pid = proc.split()[1]
+ break
+ if self.pid is not None:
break
- else:
+ time.sleep(1)
+
+ if self.pid is None:
self.pid = -1
def check_running(self):
assert self.pid is not None
+ if -1 == self.pid:
+ return False
try:
run_over_ssh(self.node.connection,
"ls /proc/{0}".format(self.pid),
@@ -414,6 +423,8 @@
def kill(self, soft=True, use_sudo=True):
assert self.pid is not None
+ if self.pid == -1:
+ return True
try:
if soft:
cmd = "kill {0}"
@@ -443,7 +454,8 @@
return True
while self.check_running() and time.time() < soft_end_of_wait_time:
- time.sleep(soft_end_of_wait_time - time.time())
+ # time.sleep(soft_end_of_wait_time - time.time())
+ time.sleep(2)
while end_of_wait_time > time.time():
time.sleep(time_till_check)
diff --git a/wally/start_vms.py b/wally/start_vms.py
index 9da66f4..b5eb17e 100644
--- a/wally/start_vms.py
+++ b/wally/start_vms.py
@@ -4,6 +4,7 @@
import time
import os.path
import logging
+import warnings
import subprocess
import collections
@@ -118,8 +119,19 @@
spath = os.path.dirname(os.path.dirname(wally.__file__))
spath = os.path.join(spath, 'scripts/prepare.sh')
- cmd = "bash {spath} >/dev/null 2>&1".format(spath=spath)
- subprocess.check_call(cmd, shell=True, env=env)
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ fname = os.tempnam()
+
+ cmd = "bash {spath} >{fname} 2>&1".format(spath=spath, fname=fname)
+ try:
+ subprocess.check_call(cmd, shell=True, env=env)
+ except:
+ logger.error("Prepare failed. Logs in " + fname)
+ with open(fname) as fd:
+ logger.error("Message:\n " + fd.read().replace("\n", "\n "))
+ raise
+ os.unlink(fname)
while True:
status = nova.images.find(name=image_name).status
diff --git a/wally/suits/io/fio.py b/wally/suits/io/fio.py
index 275c1ed..1a3e846 100644
--- a/wally/suits/io/fio.py
+++ b/wally/suits/io/fio.py
@@ -3,6 +3,7 @@
import json
import stat
import random
+import shutil
import os.path
import logging
import datetime
@@ -668,7 +669,8 @@
for idx in range(max_retr):
try:
intervals = list(pool.map(func, self.config.nodes))
- break
+ if None not in intervals:
+ break
except (EnvironmentError, SSHException) as exc:
logger.exception("During fio run")
if idx == max_retr - 1:
@@ -814,6 +816,10 @@
arch_name = self.join_remote('wally_result.tar.gz')
tmp_dir = os.path.join(self.config.log_directory, 'tmp_' + conn_id)
+
+ if os.path.exists(tmp_dir):
+ shutil.rmtree(tmp_dir)
+
os.mkdir(tmp_dir)
loc_arch_name = os.path.join(tmp_dir, 'wally_result.{0}.tar.gz'.format(conn_id))
file_full_names = " ".join(all_files)
@@ -824,7 +830,13 @@
pass
with node.connection.open_sftp() as sftp:
- exit_code = read_from_remote(sftp, self.exit_code_file)
+ try:
+ exit_code = read_from_remote(sftp, self.exit_code_file)
+ except IOError:
+ logger.error("No exit code file found on %s. Looks like process failed to start",
+ conn_id)
+ return None
+
err_out = read_from_remote(sftp, self.err_out_file)
exit_code = exit_code.strip()