Merge "Make minion restart timeout configurable"
diff --git a/src/com/mirantis/mk/Openstack.groovy b/src/com/mirantis/mk/Openstack.groovy
index 04ce85e..eedfbd8 100644
--- a/src/com/mirantis/mk/Openstack.groovy
+++ b/src/com/mirantis/mk/Openstack.groovy
@@ -478,31 +478,3 @@
 
     salt.runSaltProcessStep(env, 'I@galera:slave', 'service.start', ['mysql'])
 }
-
-/**
- * Recovers
- * @param master Salt master
- * @param recoverHost Hostname of the node to be recovered
- * @param healthyHost Hostname of healthy node from the same cluster
- * @return output of salt commands
- */
-def recoverGluster(master, recoverHost, healthyHost) {
-    def salt = new com.mirantis.mk.Salt()
-
-    // Recover glusterfs
-    if (salt.testTarget(master, 'I@glusterfs:server')) {
-        salt.enforceState(master, 'I@glusterfs:server', 'glusterfs.server.service')
-        if (healthyHost != 'none' && recoverHost != 'none') {
-            salt.runSaltCommand(master, 'local', ['expression': "E@${healthyHost}", 'type': 'compound'], "cp.push /var/lib/glusterd/vols/ upload_path='/tmp/'")
-            salt.runSaltCommand(master, 'local', ['expression': "E@${recoverHost}", 'type': 'compound'], "get_dir salt://tmp/vols/ /var/lib/glusterd/")
-        }
-        salt.enforceState(master, 'I@glusterfs:server and *01*', 'glusterfs.server.setup', true, true, null, false, -1, 5)
-        sleep(10)
-        salt.cmdRun(master, 'I@glusterfs:server', "gluster peer status; gluster volume status")
-    }
-
-    // Ensure glusterfs clusters is ready
-    if (salt.testTarget(master, 'I@glusterfs:client')) {
-        salt.enforceState(master, 'I@glusterfs:client', 'glusterfs.client')
-    }
-}
\ No newline at end of file
diff --git a/src/com/mirantis/mk/Orchestrate.groovy b/src/com/mirantis/mk/Orchestrate.groovy
index b0ecd96..b9ad8ad 100644
--- a/src/com/mirantis/mk/Orchestrate.groovy
+++ b/src/com/mirantis/mk/Orchestrate.groovy
@@ -37,7 +37,9 @@
     } catch (Throwable e) {
         common.warningMsg('Salt state salt.minion.base is not present in the Salt-formula yet.')
     }
-    salt.enforceState(master, '*', ['linux.system'])
+    common.retry(2,5){
+        salt.enforceState(master, '*', ['linux.system'])
+    }
     if (staticMgmtNet) {
         salt.runSaltProcessStep(master, '*', 'cmd.shell', ["salt-call state.sls linux.network; salt-call service.restart salt-minion"], null, true, 60)
     }
@@ -75,7 +77,9 @@
     } catch (Throwable e) {
         common.warningMsg('Salt state salt.minion.base is not present in the Salt-formula yet.')
     }
-    salt.enforceState(master, target, ['linux.system'])
+    common.retry(2,5){
+        salt.enforceState(master, target, ['linux.system'])
+    }
     if (staticMgmtNet) {
         salt.runSaltProcessStep(master, target, 'cmd.shell', ["salt-call state.sls linux.network; salt-call service.restart salt-minion"], null, true, 60)
     }
@@ -94,37 +98,35 @@
 def installInfraKvm(master) {
     def common = new com.mirantis.mk.Common()
     def salt = new com.mirantis.mk.Salt()
-    salt.fullRefresh(master, 'I@linux:system')
-    def infra_conpund = 'I@salt:control'
+    def infra_compound = 'I@salt:control'
     def minions = []
     def wait_timeout = 10
     def retries = wait_timeout * 30
 
+    salt.fullRefresh(master, 'I@linux:system')
     salt.enforceState(master, 'I@salt:control', ['salt.minion'], true, false, null, false, 60, 2)
     salt.enforceState(master, 'I@salt:control', ['linux.system', 'linux.network', 'ntp', 'rsyslog'])
     salt.enforceState(master, 'I@salt:control', 'libvirt')
     salt.enforceState(master, 'I@salt:control', 'salt.control')
 
-    timeout(wait_timeout) {
-        common.infoMsg("Waiting for minions to come up...")
-        if (salt.testTarget(master, infra_conpund)) {
-            // Gathering minions
-            for ( infra_node in salt.getMinionsSorted(master, infra_conpund) ) {
-                def pillar = salt.getPillar(master, infra_node, 'salt:control:cluster')
-                if ( !pillar['return'].isEmpty() ) {
-                    for ( cluster in pillar['return'][0].values() ) {
-                        def engine = cluster.values()[0]['engine']
-                        def domain = cluster.values()[0]['domain']
-                        def node = cluster.values()[0]['node']
-                        if ( engine == "virt" ) {
-                            def nodes = node.values()
-                            if ( !nodes.isEmpty() ) {
-                                for ( vm in nodes ) {
-                                    if ( vm['name'] != null ) {
-                                        def vm_fqdn = vm['name'] + '.' + domain
-                                        if ( !minions.contains(vm_fqdn) ) {
-                                            minions.add(vm_fqdn)
-                                        }
+    common.infoMsg("Building minions list...")
+    if (salt.testTarget(master, infra_compound)) {
+        // Gathering minions
+        for ( infra_node in salt.getMinionsSorted(master, infra_compound) ) {
+            def pillar = salt.getPillar(master, infra_node, 'salt:control:cluster')
+            if ( !pillar['return'].isEmpty() ) {
+                for ( cluster in pillar['return'][0].values() ) {
+                    def engine = cluster.values()[0]['engine']
+                    def domain = cluster.values()[0]['domain']
+                    def node = cluster.values()[0]['node']
+                    if ( engine == "virt" ) {
+                        def nodes = node.values()
+                        if ( !nodes.isEmpty() ) {
+                            for ( vm in nodes ) {
+                                if ( vm['name'] != null ) {
+                                    def vm_fqdn = vm['name'] + '.' + domain
+                                    if ( !minions.contains(vm_fqdn) ) {
+                                        minions.add(vm_fqdn)
                                     }
                                 }
                             }
@@ -133,13 +135,18 @@
                 }
             }
         }
+    }
 
-        def minions_compound = minions.join(' or ')
-        common.infoMsg('Waiting for next minions to register: ' + minions_compound,)
+    def minions_compound = minions.join(' or ')
+
+    common.infoMsg("Waiting for next minions to register within ${wait_timeout} minutes: " + minions_compound)
+    timeout(time: wait_timeout, unit: 'MINUTES') {
         salt.minionsPresentFromList(master, 'I@salt:master', minions, true, null, true, retries, 1)
-        common.infoMsg('Waiting for minions to respond')
-        salt.minionsReachable(master, 'I@salt:master', minions_compound )
+    }
 
+    common.infoMsg('Waiting for minions to respond')
+    timeout(time: wait_timeout, unit: 'MINUTES') {
+        salt.minionsReachable(master, 'I@salt:master', minions_compound)
     }
 
     common.infoMsg("All minions are up.")
@@ -771,6 +778,8 @@
 def installStacklight(master) {
     def common = new com.mirantis.mk.Common()
     def salt = new com.mirantis.mk.Salt()
+    def retries_wait = 20
+    def retries = 15
 
     // Install core services for K8S environments:
     // HAProxy, Nginx and lusterFS clients
@@ -790,18 +799,14 @@
 
     // Install MongoDB for Alerta
     if (salt.testTarget(master, 'I@mongodb:server')) {
-        salt.enforceState(master, 'I@mongodb:server', 'mongodb')
-    }
+        salt.enforceState(master, 'I@mongodb:server', 'mongodb.server')
 
-    // Configure Alerta
-    if (salt.testTarget(master, 'I@prometheus:alerta')) {
-        salt.enforceState(master, 'I@docker:swarm and I@prometheus:alerta', 'prometheus.alerta')
+        // Initialize mongodb replica set
+        common.retry(5,20){
+             salt.enforceState(master, 'I@mongodb:server', 'mongodb.cluster')
+        }
     }
 
-    // Launch containers
-    salt.enforceState(master, 'I@docker:swarm:role:master and I@prometheus:server', 'docker.client')
-    salt.runSaltProcessStep(master, 'I@docker:swarm and I@prometheus:server', 'dockerng.ps')
-
     //Install Telegraf
     salt.enforceState(master, 'I@telegraf:agent or I@telegraf:remote_agent', 'telegraf')
 
@@ -815,8 +820,34 @@
     salt.enforceState(master, 'I@elasticsearch:server', 'elasticsearch.server')
     salt.enforceState(master, '*01* and I@kibana:server', 'kibana.server')
     salt.enforceState(master, 'I@kibana:server', 'kibana.server')
-    salt.enforceState(master, 'I@elasticsearch:client', 'elasticsearch.client')
-    salt.enforceState(master, 'I@kibana:client', 'kibana.client')
+
+    // Check ES health cluster status
+    def pillar = salt.getPillar(master, 'I@elasticsearch:client', 'elasticsearch:client:server:host')
+    def elasticsearch_vip
+    if(!pillar['return'].isEmpty()) {
+        elasticsearch_vip = pillar['return'][0].values()[0]
+    } else {
+        common.errorMsg('[ERROR] Elasticsearch VIP address could not be retrieved')
+    }
+    pillar = salt.getPillar(master, 'I@elasticsearch:client', 'elasticsearch:client:server:port')
+    def elasticsearch_port
+    if(!pillar['return'].isEmpty()) {
+        elasticsearch_port = pillar['return'][0].values()[0]
+    } else {
+        common.errorMsg('[ERROR] Elasticsearch VIP port could not be retrieved')
+    }
+    common.retry(retries,retries_wait) {
+        common.infoMsg('Waiting for Elasticsearch to become green..')
+        salt.cmdRun(master, 'I@elasticsearch:client', "curl -sf ${elasticsearch_vip}:${elasticsearch_port}/_cat/health | awk '{print \$4}' | grep green")
+    }
+
+    common.retry(retries,retries_wait) {
+        salt.enforceState(master, 'I@elasticsearch:client', 'elasticsearch.client')
+    }
+
+    common.retry(retries,retries_wait) {
+        salt.enforceState(master, 'I@kibana:client', 'kibana.client')
+    }
 
     //Install InfluxDB
     if (salt.testTarget(master, 'I@influxdb:server')) {
@@ -824,11 +855,6 @@
         salt.enforceState(master, 'I@influxdb:server', 'influxdb')
     }
 
-    //Install Prometheus LTS
-    if (salt.testTarget(master, 'I@prometheus:relay')) {
-        salt.enforceState(master, 'I@prometheus:relay', 'prometheus')
-    }
-
     // Install service for the log collection
     if (salt.testTarget(master, 'I@fluentd:agent')) {
         salt.enforceState(master, 'I@fluentd:agent', 'fluentd')
@@ -866,13 +892,22 @@
         salt.enforceState(master, 'I@docker:swarm and I@prometheus:server', 'heka.remote_collector', true, false)
     }
 
+    // Launch containers
+    salt.enforceState(master, 'I@docker:swarm:role:master and I@prometheus:server', 'docker.client')
+    salt.runSaltProcessStep(master, 'I@docker:swarm and I@prometheus:server', 'dockerng.ps')
+
+    //Install Prometheus LTS
+    if (salt.testTarget(master, 'I@prometheus:relay')) {
+        salt.enforceState(master, 'I@prometheus:relay', 'prometheus')
+    }
+
     // Install sphinx server
     if (salt.testTarget(master, 'I@sphinx:server')) {
         salt.enforceState(master, 'I@sphinx:server', 'sphinx')
     }
 
     //Configure Grafana
-    def pillar = salt.getPillar(master, 'ctl01*', '_param:stacklight_monitor_address')
+    pillar = salt.getPillar(master, 'ctl01*', '_param:stacklight_monitor_address')
     common.prettyPrint(pillar)
 
     def stacklight_vip
diff --git a/src/com/mirantis/mk/Salt.groovy b/src/com/mirantis/mk/Salt.groovy
index 46f27a6..188af61 100644
--- a/src/com/mirantis/mk/Salt.groovy
+++ b/src/com/mirantis/mk/Salt.groovy
@@ -270,27 +270,35 @@
     if (waitUntilPresent){
         def count = 0
         while(count < maxRetries) {
+            try { // one attempt: run the command and inspect its output; any exception is only logged below
+                def out = runSaltCommand(saltId, 'local', ['expression': target, 'type': 'compound'], 'cmd.shell', batch, [cmd], null, 5)
+                if (output) {
+                    printSaltCommandResult(out)
+                }
+                def valueMap = out["return"][0]
+                def result = valueMap.get(valueMap.keySet()[0]) // value stored under the first key of the return map
+                def resultsArray = result.tokenize("\n")
+                def size = resultsArray.size()
+                if (size >= answers) { // enough output lines: done, hand the raw result back to the caller
+                    return out
+                }
+                sleep(time: 1000, unit: 'MILLISECONDS')
+                common.infoMsg("Waiting for ${cmd} on ${target} to be in correct state")
+            } catch (Exception er) {
+                common.infoMsg('[WARNING]: runSaltCommand command read timeout within 5 seconds. You have very slow or broken environment')
+            }
+            count++ // outside the try on purpose: a throwing attempt must also use up a retry, otherwise the loop never terminates
+        }
+    } else {
+        try {
             def out = runSaltCommand(saltId, 'local', ['expression': target, 'type': 'compound'], 'cmd.shell', batch, [cmd], null, 5)
             if (output) {
                 printSaltCommandResult(out)
             }
-            def valueMap = out["return"][0]
-            def result = valueMap.get(valueMap.keySet()[0])
-            def resultsArray = result.tokenize("\n")
-            def size = resultsArray.size()
-            if (size >= answers) {
-                return out
-            }
-            count++
-            sleep(time: 1000, unit: 'MILLISECONDS')
-            common.infoMsg("Waiting for ${cmd} on ${target} to be in correct state")
+            return out
+        } catch (Exception er) {
+            common.infoMsg('[WARNING]: runSaltCommand command read timeout within 5 seconds. You have very slow or broken environment')
         }
-    } else {
-        def out = runSaltCommand(saltId, 'local', ['expression': target, 'type': 'compound'], 'cmd.shell', batch, [cmd], null, 5)
-        if (output) {
-            printSaltCommandResult(out)
-        }
-        return out
     }
     // otherwise throw exception
     common.errorMsg("Status of command ${cmd} on ${target} failed, please check it.")