ceph upgrade fix: per-node upgrade with VM backup and optional finalize

PROD-15484
PROD-15483

Change-Id: I1e344b49407c07599da83aa52f930882910c208d
diff --git a/ceph-upgrade.groovy b/ceph-upgrade.groovy
index db6bea3..5844f77 100644
--- a/ceph-upgrade.groovy
+++ b/ceph-upgrade.groovy
@@ -30,40 +30,100 @@
     return salt.cmdRun(master, target, cmd)
 }
 
+def waitForHealthy(master, count=0, attempts=300) {
+    // wait for healthy cluster
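+    // Poll `ceph health` on the admin node every 10 seconds, up to `attempts` times
+    // (300 by default, i.e. roughly 50 minutes), and return as soon as HEALTH_OK is reported.
+    // If the cluster never becomes healthy the loop times out without raising an error.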
+    while (count < attempts) {
+        def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
+        if (health.contains('HEALTH_OK')) {
+            common.infoMsg('Cluster is healthy')
+            break;
+        }
+        count++
+        sleep(10)
+    }
+}
+
+def backup(master, target) {
+    stage("backup ${target}") {
+
+        def _pillar = salt.getGrain(master, 'I@salt:master', 'domain')
+        def domain = _pillar['return'][0].values()[0].values()[0]
+
+        def kvm_pillar = salt.getGrain(master, 'I@salt:control', 'id')
+        def kvm01 = kvm_pillar['return'][0].values()[0].values()[0]
+
+        def target_pillar = salt.getGrain(master, "I@ceph:${target}", 'host')
+        def minions = target_pillar['return'][0].values()
+        for (minion in minions) {
+            def minion_name = minion.values()[0]
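+            // Look up which KVM host (provider) runs this VM in the salt:control pillar on kvm01.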
+            def provider_pillar = salt.getPillar(master, "${kvm01}", "salt:control:cluster:internal:node:${minion_name}:provider")
+            def minionProvider = provider_pillar['return'][0].values()[0]
+
+            waitForHealthy(master)
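+            // The "[ ! -f .../<minion>.<domain>.qcow2.bak ] && ..." guard turns both steps into
+            // no-ops once a backup exists; cmdRun raises on the non-zero exit code, hence the try/catch.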
+            try {
+                salt.cmdRun(master, "${minionProvider}", "[ ! -f /root/${minion_name}.${domain}.qcow2.bak ] && virsh destroy ${minion_name}.${domain}")
+            } catch (Exception e) {
+                common.warningMsg('Backup already exists or the VM is not running')
+            }
+            try {
+                salt.cmdRun(master, "${minionProvider}", "[ ! -f /root/${minion_name}.${domain}.qcow2.bak ] && cp /var/lib/libvirt/images/${minion_name}.${domain}/system.qcow2 /root/${minion_name}.${domain}.qcow2.bak")
+            } catch (Exception e) {
+                common.warningMsg('Backup already exists')
+            }
+            try {
+                salt.cmdRun(master, "${minionProvider}", "virsh start ${minion_name}.${domain}")
+            } catch (Exception e) {
+                common.warningMsg(e)
+            }
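+            // Wait for the restarted VM's minion to respond and for the cluster to settle
+            // before moving on to the next node.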
+            salt.minionsReachable(master, 'I@salt:master', "${minion_name}*")
+            waitForHealthy(master)
+        }
+    }
+    return
+}
+
 def upgrade(master, target) {
 
     stage("Change ${target} repos") {
         salt.runSaltProcessStep(master, "I@ceph:${target}", 'saltutil.refresh_pillar', [], null, true, 5)
         salt.enforceState(master, "I@ceph:${target}", 'linux.system.repo', true)
     }
-
     if (target == 'mgr') {
         stage('Run ceph mgr state') {
             salt.enforceState(master, "I@ceph:mgr", "ceph.mgr", true)
         }
     }
-
     if (target == 'common') {
         stage('Upgrade ceph-common pkgs') {
-            runCephCommand(master, "I@ceph:${target}", "apt install ceph-${target} -y ")
-        }
-    } else if (target == 'radosgw') {
-        stage('Upgrade radosgw pkgs') {
-            runCephCommand(master, "I@ceph:${target}", "apt install ${target} -y ")
-        }
-        // restart services
-        stage("Restart ${target} services") {
-            runCephCommand(master, "I@ceph:${target}", "systemctl restart ceph-${target}.target")
+            runCephCommand(master, "I@ceph:${target}", "apt install ceph-${target} -y")
         }
     } else {
+        def minions = salt.getMinions(master, "I@ceph:${target}")
 
-        // upgrade pkgs
-        stage("Upgrade ${target} pkgs") {
-            runCephCommand(master, "I@ceph:${target}", "apt install ceph-${target} -y ")
-        }
-        // restart services
-        stage("Restart ${target} services") {
-            runCephCommand(master, "I@ceph:${target}", "systemctl restart ceph-${target}.target")
+        for (minion in minions) {
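+            // Upgrade one node at a time: install the packages, restart the ceph-${target}
+            // units, show the cluster status and ask for confirmation before the next node.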
+            // upgrade pkgs
+            if (target == 'radosgw') {
+                stage("Upgrade radosgw pkgs on ${minion}") {
+                    runCephCommand(master, "${minion}", "apt install ${target} -y")
+                }
+            } else {
+                stage("Upgrade ${target} pkgs on ${minion}") {
+                    runCephCommand(master, "${minion}", "apt install ceph-${target} -y")
+                }
+            }
+            // restart services
+            stage("Restart ${target} services on ${minion}") {
+                runCephCommand(master, "${minion}", "systemctl restart ceph-${target}.target")
+            }
+
+            stage("Verify services for ${minion}") {
+                sleep(10)
+                runCephCommand(master, ADMIN_HOST, "ceph -s")
+            }
+
+            stage('Ask for manual confirmation') {
+                input message: "From the verification command above, please check that Ceph ${target} on this node joined the cluster correctly. If so, do you want to continue upgrading the next node?"
+            }
         }
     }
     runCephCommand(master, ADMIN_HOST, "ceph versions")
@@ -76,6 +136,15 @@
     // create connection to salt master
     python.setupPepperVirtualenv(pepperEnv, SALT_MASTER_URL, SALT_MASTER_CREDENTIALS)
 
+    if (BACKUP_ENABLED.toBoolean() == true) {
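+        // Back up the VMs only for the roles that are actually being upgraded in this run.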
+        if (STAGE_UPGRADE_MON.toBoolean() == true) {
+            backup(pepperEnv, 'mon')
+        }
+        if (STAGE_UPGRADE_RGW.toBoolean() == true) {
+            backup(pepperEnv, 'radosgw')
+        }
+    }
+
     if (flags.size() > 0) {
         stage('Set cluster flags') {
             for (flag in flags) {
@@ -86,42 +155,18 @@
 
     if (STAGE_UPGRADE_MON.toBoolean() == true) {
         upgrade(pepperEnv, 'mon')
-        stage("Verify mon services") {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph mon stat")
-        }
-        stage('Ask for manual confirmation') {
-            input message: "From the verification command above, please check Ceph mons joined the cluster. If so, Do you want to continue?"
-        }
     }
 
     if (STAGE_UPGRADE_MGR.toBoolean() == true) {
         upgrade(pepperEnv, 'mgr')
-        stage("Verify mgr services") {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph -s")
-        }
-        stage('Ask for manual confirmation') {
-            input message: "From the verification command above, please check Ceph mgr joined the cluster. If so, Do you want to continue?"
-        }
     }
 
     if (STAGE_UPGRADE_OSD.toBoolean() == true) {
         upgrade(pepperEnv, 'osd')
-        stage("Verify osd services") {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd stat")
-        }
-        stage('Ask for manual confirmation') {
-            input message: "From the verification command above, please check Ceph osds joined the cluster. If so, Do you want to continue?"
-        }
     }
 
     if (STAGE_UPGRADE_RGW.toBoolean() == true) {
         upgrade(pepperEnv, 'radosgw')
-        stage("Verify rgw services") {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph -s")
-        }
-        stage('Ask for manual confirmation') {
-            input message: "From the verification command above, please check Ceph rgw joined the cluster. If so, Do you want to continue?"
-        }
     }
 
     if (STAGE_UPGRADE_CLIENT.toBoolean() == true) {
@@ -141,27 +186,24 @@
         }
     }
 
-    stage("Finalize ceph version upgrade") {
-        runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd require-osd-release ${TARGET_RELEASE}")
-        try {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd set-require-min-compat-client ${ORIGIN_RELEASE}")
-        } catch (Exception e) {
-            common.warningMsg(e)
+    if (STAGE_FINALIZE.toBoolean() == true) {
+        stage("Finalize ceph version upgrade") {
+            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd require-osd-release ${TARGET_RELEASE}")
+            try {
+                runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd set-require-min-compat-client ${ORIGIN_RELEASE}")
+            } catch (Exception e) {
+                common.warningMsg(e)
+            }
+            try {
+                runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd crush tunables optimal")
+            } catch (Exception e) {
+                common.warningMsg(e)
+            }
         }
-        runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd crush tunables optimal")
     }
 
     // wait for healthy cluster
     if (WAIT_FOR_HEALTHY.toBoolean() == true) {
-        stage('Waiting for healthy cluster') {
-            while (true) {
-                def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph -s')['return'][0].values()[0]
-                if (health.contains('HEALTH_OK')) {
-                    common.infoMsg('Cluster is healthy')
-                    break;
-                }
-                sleep(10)
-            }
-        }
+        waitForHealthy(pepperEnv)
     }
 }