Skipping nodes, error handling
diff --git a/cfg_checker/common/salt_utils.py b/cfg_checker/common/salt_utils.py
index d4bd007..b913531 100644
--- a/cfg_checker/common/salt_utils.py
+++ b/cfg_checker/common/salt_utils.py
@@ -86,7 +86,7 @@
self._token = self._login()
self.last_response = None
- def get(self, path='', headers=default_headers, cookies=None):
+ def get(self, path='', headers=default_headers, cookies=None, timeout=None):
_path = os.path.join(self.uri, path)
logger.debug("# GET '{}'\nHeaders: '{}'\nCookies: {}".format(
_path,
@@ -96,7 +96,8 @@
return requests.get(
_path,
headers=headers,
- cookies=cookies
+ cookies=cookies,
+ timeout=timeout
)
def post(self, data, path='', headers=default_headers, cookies=None):
@@ -286,7 +287,12 @@
Works starting from 2017.7.7
api returns dict of minions with grains
"""
- return self.salt_request('get', 'minions')[0]
+ try:
+ _r = self.salt_request('get', 'minions', timeout=10)
+ except requests.exceptions.ReadTimeout as e:
+ logger_cli.debug("... timeout waiting list minions from Salt API")
+ _r = None
+ return _r[0] if _r else None
def list_keys(self):
"""
@@ -321,7 +327,7 @@
"""
if config.skip_nodes:
logger.info("# Nodes to be skipped: {0}".format(config.skip_nodes))
- return self.cmd(
+ _r = self.cmd(
'* and not ' + list_to_target_string(
config.skip_nodes,
'and not'
@@ -329,7 +335,9 @@
'test.ping',
expr_form='compound')
else:
- return self.cmd('*', 'test.ping')
+ _r = self.cmd('*', 'test.ping')
+ # Return all nodes that responded
+ return [node for node in _r.keys() if _r[node]]
def get_monitoring_ip(self, param_name):
salt_output = self.cmd(
diff --git a/cfg_checker/modules/network/checker.py b/cfg_checker/modules/network/checker.py
index 7a654dc..5230039 100644
--- a/cfg_checker/modules/network/checker.py
+++ b/cfg_checker/modules/network/checker.py
@@ -40,6 +40,9 @@
_result = self.execute_script_on_active_nodes("ifs_data.py", args=["json"])
for key in self.nodes.keys():
+ # check if we are to work with this node
+ if not self.is_node_available(key):
+ continue
# due to much data to be passed from salt, it is happening in order
if key in _result:
_text = _result[key]
@@ -59,6 +62,9 @@
# match interfaces by IP subnets
_all_nets = {}
for host, node_data in self.nodes.iteritems():
+ if not self.is_node_available(host):
+ continue
+
for net_name, net_data in node_data['networks'].iteritems():
# get ips and calculate subnets
if net_name in ['lo']:
@@ -89,7 +95,12 @@
# Get required pillars
self.get_specific_pillar_for_nodes("linux:network")
for node in self.nodes.keys():
- _pillar = self.nodes[node]['pillars']['linux']['network']['interface']
+ # check if this node is available
+ if not self.is_node_available(node):
+ continue
+ # get the reclass value
+ _pillar = self.nodes[node]['pillars']['linux']['network']
+ _pillar = _pillar['interface']
for _if_name, _if_data in _pillar.iteritems():
if 'address' in _if_data:
_if = ipaddress.IPv4Interface(
@@ -116,17 +127,19 @@
"""
_all_nets = self.all_nets.keys()
logger_cli.info("# Reclass networks")
- _text = " {0:17} {1:25}: {2:19} {3:5}{4:10} {5}{6} {7}/{8}/{9}".format(
- "Hostname",
- "IF name",
- "IP",
- "Runtime MTU",
- "Reclass MTU",
- "Runtime State",
- "Reclass State",
- "Runtime gate",
- "Runtime def. gate",
- "Reclass gate"
+ logger_cli.info(
+ " {0:17} {1:25}: {2:19} {3:5}{4:10} {5}{6} {7} / {8} / {9}".format(
+ "Hostname",
+ "IF",
+ "IP",
+ "rtMTU",
+ "rcMTU",
+ "rtState",
+ "rcState",
+ "rtGate",
+ "rtDef.Gate",
+ "rcGate"
+ )
)
_reclass = [n for n in _all_nets if n in self.reclass_nets]
@@ -136,10 +149,21 @@
logger_cli.info("-> {}".format(_net))
names = sorted(self.all_nets[network].keys())
for hostname in names:
+ if not self.is_node_available(hostname, log=False):
+ logger_cli.info(
+ " {0:17} {1}".format(
+ hostname.split('.')[0],
+ "... no data for the node"
+ )
+ )
+
# get the gateway for current net
_routes = self.nodes[hostname]['routes']
_route = _routes[_net] if _net in _routes else None
- _gate = _route['gateway'] if _route['gateway'] else "empty"
+ if not _route:
+ _gate = "no route!"
+ else:
+ _gate = _route['gateway'] if _route['gateway'] else "empty"
# get the default gateway
if 'default' in _routes:
@@ -156,10 +180,16 @@
# Take gateway parameter for this IF
# from corresponding reclass record
_pillar = self.nodes[hostname]['pillars']
- _rd = _pillar['linux']['network']['interface'][_a['name']]
- _r_gate = _rd['gateway'] if 'gateway' in _rd else "empty"
+ _pillar = _pillar['linux']['network']['interface']
+ if not self.is_node_available(hostname):
+ _r_gate = "-"
+ elif _a['name'] not in _pillar:
+ _r_gate = "no IF in reclass!"
+ else:
+ _rd = _pillar[_a['name']]
+ _r_gate = _rd['gateway'] if 'gateway' in _rd else "empty"
- _text = "{0:25}: {1:19} {2:5}{3:10} {4:4}{5:10} {6}/{7}/{8}".format(
+ _text = "{0:25}: {1:19} {2:5}{3:10} {4:4}{5:10} {6} / {7} / {8}".format(
_a['name'],
str(_a['if'].ip),
_a['mtu'],
diff --git a/cfg_checker/nodes.py b/cfg_checker/nodes.py
index 798a8e5..30de749 100644
--- a/cfg_checker/nodes.py
+++ b/cfg_checker/nodes.py
@@ -57,17 +57,22 @@
# just inventory for faster interaction
# iterate through all accepted nodes and create a dict for it
self.nodes = {}
+ self.skip_list = []
for _name in _minions:
_nc = utils.get_node_code(_name)
_rmap = const.all_roles_map
_role = _rmap[_nc] if _nc in _rmap else 'unknown'
_status = const.NODE_UP if _name in _active else const.NODE_DOWN
-
+ if _status == const.NODE_DOWN:
+ self.skip_list.append(_name)
+ logger_cli.info("-> '{}' is down, marked to skip".format(
+ _name
+ ))
self.nodes[_name] = deepcopy(node_tmpl)
self.nodes[_name]['node_group'] = _nc
self.nodes[_name]['role'] = _role
self.nodes[_name]['status'] = _status
-
+ logger_cli.info("-> {} nodes inactive".format(len(self.skip_list)))
logger_cli.info("-> {} nodes collected".format(len(self.nodes)))
# form an all nodes compound string to use in salt
@@ -88,7 +93,16 @@
"""
logger_cli.debug("...collecting node pillars for '{}'".format(pillar_path))
_result = self.salt.pillar_get(self.active_nodes_compound, pillar_path)
+ self.not_responded = []
for node, data in self.nodes.iteritems():
+ if node in self.skip_list:
+ logger_cli.debug(
+ "... '{}' skipped while collecting '{}'".format(
+ node,
+ pillar_path
+ )
+ )
+ continue
_pillar_keys = pillar_path.split(':')
_data = data['pillars']
# pre-create nested dict
@@ -97,7 +111,19 @@
if _key not in _data:
_data[_key] = {}
_data = _data[_key]
- _data[_pillar_keys[-1]] = _result[node]
+ if data['status'] == const.NODE_DOWN:
+ _data[_pillar_keys[-1]] = None
+ elif not _result[node]:
+ logger_cli.debug(
+ "... '{}' not responded after '{}'".format(
+ node,
+ config.salt_timeout
+ )
+ )
+ _data[_pillar_keys[-1]] = None
+ self.not_responded.append(node)
+ else:
+ _data[_pillar_keys[-1]] = _result[node]
def execute_script_on_active_nodes(self, script_filename, args=[]):
# Prepare script
@@ -151,13 +177,27 @@
logger.debug("Running script to all nodes")
# handle results for each node
_script_arguments = " ".join(args) if args else ""
- _result = self.salt.cmd(
+ self.not_responded = []
+ _r = self.salt.cmd(
self.active_nodes_compound,
'cmd.run',
param='python {} {}'.format(_target_path, _script_arguments),
expr_form="compound"
)
- # TODO: Handle error result
+ # any node with a false (empty) return is treated as not responded
+ self.not_responded = [_n for _n in _r.keys() if not _r[_n]]
+ return _r
- return _result
+ def is_node_available(self, node, log=True):
+ if node in self.skip_list:
+ if log:
+ logger_cli.info("-> node '{}' not active".format(node))
+ return False
+ elif node in self.not_responded:
+ if log:
+ logger_cli.info("-> node '{}' not responded".format(node))
+ return False
+ else:
+ return True
+