Fix and extend Ceph upgrade pipelines

* Extract the repeated HEALTH_OK polling loop into a shared waitForHealthy()
  helper with a bounded number of attempts.
* ceph-backend-migration: optionally verify Ceph status after every OSD disk
  migration (PER_OSD_CONTROL) and after each whole OSD host
  (PER_OSD_HOST_CONTROL), with manual confirmation before moving on.
* ceph-upgrade: optionally back up mon and radosgw VM images before the
  upgrade (BACKUP_ENABLED), upgrade mon/mgr/osd/radosgw nodes one by one with
  verification and manual confirmation between nodes, and gate the
  finalization stage behind STAGE_FINALIZE.
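A minimal sketch of the job parameters these pipelines now reference; the
example values below are illustrative assumptions, not taken from the actual
job definitions:

    // assumed string job parameters, evaluated with .toBoolean() in the scripts
    BACKUP_ENABLED       = 'false'  // back up mon/rgw VM images before upgrade (ceph-upgrade.groovy)
    STAGE_FINALIZE       = 'true'   // run the require-osd-release/tunables stage (ceph-upgrade.groovy)
    PER_OSD_CONTROL      = 'true'   // verify Ceph status after each OSD disk migration (ceph-backend-migration.groovy)
    PER_OSD_HOST_CONTROL = 'true'   // verify Ceph status after each whole OSD host migration (ceph-backend-migration.groovy)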

PROD-15484
PROD-15483

Change-Id: I1e344b49407c07599da83aa52f930882910c208d
diff --git a/ceph-backend-migration.groovy b/ceph-backend-migration.groovy
index 0e9b6b7..8b8d256 100644
--- a/ceph-backend-migration.groovy
+++ b/ceph-backend-migration.groovy
@@ -12,6 +12,8 @@
  *  CLUSTER_FLAGS                   Comma separated list of tags to apply to cluster
 *  WAIT_FOR_HEALTHY                Wait for cluster rebalance before stopping daemons
  *  ORIGIN_BACKEND                  Ceph backend before upgrade
+ *  PER_OSD_CONTROL                 Set to true to verify Ceph status after every OSD disk migration
+ *  PER_OSD_HOST_CONTROL            Set to true to verify Ceph status after migration of a whole OSD host
  *
  */
 
@@ -30,6 +32,19 @@
     return salt.cmdRun(master, target, cmd)
 }
 
+def waitForHealthy(master, count=0, attempts=300) {
+    // wait for healthy cluster
+    while (count<attempts) {
+        def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
+        if (health.contains('HEALTH_OK')) {
+            common.infoMsg('Cluster is healthy')
+            break;
+        }
+        count++
+        sleep(10)
+    }
+}
+
 node("python") {
 
     // create connection to salt master
@@ -72,16 +87,7 @@
 
                     // wait for healthy cluster before manipulating osds
                     if (WAIT_FOR_HEALTHY.toBoolean() == true) {
-                        stage('Waiting for healthy cluster') {
-                            while (true) {
-                                def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
-                                if (health.contains('HEALTH_OK')) {
-                                    common.infoMsg('Cluster is healthy')
-                                    break;
-                                }
-                                sleep(5)
-                            }
-                        }
+                        waitForHealthy(pepperEnv)
                     }
 
                     // `ceph osd out <id> <id>`
@@ -89,19 +95,9 @@
                             runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd out ${osd_id}")
                     }
 
-                    // wait for healthy cluster
                     if (WAIT_FOR_HEALTHY.toBoolean() == true) {
-                        stage('Waiting for healthy cluster') {
-                            sleep(5)
-                            while (true) {
-                                def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
-                                if (health.contains('HEALTH_OK')) {
-                                    common.infoMsg('Cluster is healthy')
-                                    break;
-                                }
-                                sleep(10)
-                            }
-                        }
+                        sleep(5)
+                        waitForHealthy(pepperEnv)
                     }
 
                     // stop osd daemons
@@ -182,8 +178,32 @@
                         salt.runSaltProcessStep(pepperEnv, HOST, 'saltutil.refresh_pillar', [], null, true, 5)
                         salt.enforceState(pepperEnv, HOST, 'ceph.osd', true)
                     }
+
+                    if (PER_OSD_CONTROL.toBoolean() == true) {
+                        stage("Verify backend version for osd.${id}") {
+                            sleep(5)
+                            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd metadata ${id} | grep osd_objectstore")
+                            runCephCommand(pepperEnv, ADMIN_HOST, "ceph -s")
+                        }
+
+                        stage('Ask for manual confirmation') {
+                            input message: "From the verification commands above, please check the backend version of osd.${id} and the Ceph status. If they are correct, do you want to continue migrating the next OSD?"
+                        }
+                    }
                 }
             }
+            if (PER_OSD_HOST_CONTROL.toBoolean() == true) {
+                stage("Verify backend versions") {
+                    sleep(5)
+                    runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd metadata | grep osd_objectstore -B2")
+                    runCephCommand(pepperEnv, ADMIN_HOST, "ceph -s")
+                }
+
+                stage('Ask for manual confirmation') {
+                    input message: "From the verification commands above, please check the Ceph status and the backend versions of the OSDs on this host. If they are correct, do you want to continue migrating the next OSD host?"
+                }
+            }
+
         }
         // remove cluster flags
         if (flags.size() > 0) {
diff --git a/ceph-remove-node.groovy b/ceph-remove-node.groovy
index bc0b09d..cda53be 100644
--- a/ceph-remove-node.groovy
+++ b/ceph-remove-node.groovy
@@ -24,6 +24,19 @@
     return salt.cmdRun(master, target, cmd)
 }
 
+def waitForHealthy(master, count=0, attempts=300) {
+    // wait for healthy cluster
+    while (count<attempts) {
+        def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
+        if (health.contains('HEALTH_OK')) {
+            common.infoMsg('Cluster is healthy')
+            break;
+        }
+        count++
+        sleep(10)
+    }
+}
+
 node("python") {
 
     // create connection to salt master
@@ -91,17 +104,8 @@
 
         // wait for healthy cluster
         if (WAIT_FOR_HEALTHY.toBoolean() == true) {
-            stage('Waiting for healthy cluster') {
-                sleep(5)
-                while (true) {
-                    def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
-                    if (health.contains('HEALTH_OK')) {
-                        common.infoMsg('Cluster is healthy')
-                        break;
-                    }
-                    sleep(10)
-                }
-            }
+            sleep(5)
+            waitForHealthy(pepperEnv)
         }
 
         // stop osd daemons
diff --git a/ceph-remove-osd.groovy b/ceph-remove-osd.groovy
index c5610eb..c51292e 100644
--- a/ceph-remove-osd.groovy
+++ b/ceph-remove-osd.groovy
@@ -26,6 +26,19 @@
     return salt.cmdRun(master, ADMIN_HOST, cmd)
 }
 
+def waitForHealthy(master, count=0, attempts=300) {
+    // wait for healthy cluster
+    while (count<attempts) {
+        def health = runCephCommand(master, 'ceph health')['return'][0].values()[0]
+        if (health.contains('HEALTH_OK')) {
+            common.infoMsg('Cluster is healthy')
+            break;
+        }
+        count++
+        sleep(10)
+    }
+}
+
 node("python") {
 
     // create connection to salt master
@@ -65,17 +78,8 @@
 
     // wait for healthy cluster
     if (WAIT_FOR_HEALTHY.toBoolean() == true) {
-        stage('Waiting for healthy cluster') {
-            sleep(5)
-            while (true) {
-                def health = runCephCommand(pepperEnv, 'ceph health')['return'][0].values()[0]
-                if (health.contains('HEALTH_OK')) {
-                    common.infoMsg('Cluster is healthy')
-                    break;
-                }
-                sleep(10)
-            }
-        }
+        sleep(5)
+        waitForHealthy(pepperEnv)
     }
 
     // stop osd daemons
diff --git a/ceph-replace-failed-osd.groovy b/ceph-replace-failed-osd.groovy
index ee4ef38..9127581 100644
--- a/ceph-replace-failed-osd.groovy
+++ b/ceph-replace-failed-osd.groovy
@@ -31,6 +31,19 @@
     return salt.cmdRun(master, target, cmd)
 }
 
+def waitForHealthy(master, count=0, attempts=300) {
+    // wait for healthy cluster
+    while (count<attempts) {
+        def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
+        if (health.contains('HEALTH_OK')) {
+            common.infoMsg('Cluster is healthy')
+            break;
+        }
+        count++
+        sleep(10)
+    }
+}
+
 node("python") {
 
     // create connection to salt master
@@ -70,17 +83,8 @@
 
     // wait for healthy cluster
     if (WAIT_FOR_HEALTHY.toBoolean() == true) {
-        stage('Waiting for healthy cluster') {
-            sleep(5)
-            while (true) {
-                def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
-                if (health.contains('HEALTH_OK')) {
-                    common.infoMsg('Cluster is healthy')
-                    break;
-                }
-                sleep(10)
-            }
-        }
+        sleep(5)
+        waitForHealthy(pepperEnv)
     }
 
     // stop osd daemons
diff --git a/ceph-upgrade.groovy b/ceph-upgrade.groovy
index db6bea3..5844f77 100644
--- a/ceph-upgrade.groovy
+++ b/ceph-upgrade.groovy
@@ -30,40 +30,100 @@
     return salt.cmdRun(master, target, cmd)
 }
 
+def waitForHealthy(master, count=0, attempts=300) {
+    // wait for healthy cluster
+    while (count<attempts) {
+        def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
+        if (health.contains('HEALTH_OK')) {
+            common.infoMsg('Cluster is healthy')
+            break;
+        }
+        count++
+        sleep(10)
+    }
+}
+
+def backup(master, target) {
+    stage("backup ${target}") {
+
+        def _pillar = salt.getGrain(master, 'I@salt:master', 'domain')
+        def domain = _pillar['return'][0].values()[0].values()[0]
+
+        def kvm_pillar = salt.getGrain(master, 'I@salt:control', 'id')
+        def kvm01 = kvm_pillar['return'][0].values()[0].values()[0]
+
+        def target_pillar = salt.getGrain(master, "I@ceph:${target}", 'host')
+        def minions = target_pillar['return'][0].values()
+        for (minion in minions) {
+            def minion_name = minion.values()[0]
+            def provider_pillar = salt.getPillar(master, "${kvm01}", "salt:control:cluster:internal:node:${minion_name}:provider")
+            def minionProvider = provider_pillar['return'][0].values()[0]
+
+            waitForHealthy(master)
+            try {
+                salt.cmdRun(master, "${minionProvider}", "[ ! -f /root/${minion_name}.${domain}.qcow2.bak ] && virsh destroy ${minion_name}.${domain}")
+            } catch (Exception e) {
+                common.warningMsg('Backup already exists')
+            }
+            try {
+                salt.cmdRun(master, "${minionProvider}", "[ ! -f /root/${minion_name}.${domain}.qcow2.bak ] && cp /var/lib/libvirt/images/${minion_name}.${domain}/system.qcow2 /root/${minion_name}.${domain}.qcow2.bak")
+            } catch (Exception e) {
+                common.warningMsg('Backup already exists')
+            }
+            try {
+                salt.cmdRun(master, "${minionProvider}", "virsh start ${minion_name}.${domain}")
+            } catch (Exception e) {
+                common.warningMsg(e)
+            }
+            salt.minionsReachable(master, 'I@salt:master', "${minion_name}*")
+            waitForHealthy(master)
+        }
+    }
+    return
+}
+
 def upgrade(master, target) {
 
     stage("Change ${target} repos") {
         salt.runSaltProcessStep(master, "I@ceph:${target}", 'saltutil.refresh_pillar', [], null, true, 5)
         salt.enforceState(master, "I@ceph:${target}", 'linux.system.repo', true)
     }
-
     if (target == 'mgr') {
         stage('Run ceph mgr state') {
             salt.enforceState(master, "I@ceph:mgr", "ceph.mgr", true)
         }
     }
-
     if (target == 'common') {
         stage('Upgrade ceph-common pkgs') {
-            runCephCommand(master, "I@ceph:${target}", "apt install ceph-${target} -y ")
-        }
-    } else if (target == 'radosgw') {
-        stage('Upgrade radosgw pkgs') {
-            runCephCommand(master, "I@ceph:${target}", "apt install ${target} -y ")
-        }
-        // restart services
-        stage("Restart ${target} services") {
-            runCephCommand(master, "I@ceph:${target}", "systemctl restart ceph-${target}.target")
+            runCephCommand(master, "I@ceph:${target}", "apt install ceph-${target} -y")
         }
     } else {
+        minions = salt.getMinions(master, "I@ceph:${target}")
 
-        // upgrade pkgs
-        stage("Upgrade ${target} pkgs") {
-            runCephCommand(master, "I@ceph:${target}", "apt install ceph-${target} -y ")
-        }
-        // restart services
-        stage("Restart ${target} services") {
-            runCephCommand(master, "I@ceph:${target}", "systemctl restart ceph-${target}.target")
+        for (minion in minions) {
+            // upgrade pkgs
+            if (target == 'radosgw') {
+                stage("Upgrade radosgw pkgs on ${minion}") {
+                    runCephCommand(master, "${minion}", "apt install ${target} -y")
+                }
+            } else {
+                stage("Upgrade ${target} pkgs on ${minion}") {
+                    runCephCommand(master, "${minion}", "apt install ceph-${target} -y")
+                }
+            }
+            // restart services
+            stage("Restart ${target} services on ${minion}") {
+                runCephCommand(master, "${minion}", "systemctl restart ceph-${target}.target")
+            }
+
+            stage("Verify services for ${minion}") {
+                sleep(10)
+                runCephCommand(master, ADMIN_HOST, "ceph -s")
+            }
+
+            stage('Ask for manual confirmation') {
+                input message: "From the verification command above, please check that the Ceph ${target} service on ${minion} joined the cluster correctly. If so, do you want to continue upgrading the next node?"
+            }
         }
     }
     runCephCommand(master, ADMIN_HOST, "ceph versions")
@@ -76,6 +136,15 @@
     // create connection to salt master
     python.setupPepperVirtualenv(pepperEnv, SALT_MASTER_URL, SALT_MASTER_CREDENTIALS)
 
+    if (BACKUP_ENABLED.toBoolean() == true) {
+        if (STAGE_UPGRADE_MON.toBoolean() == true) {
+            backup(pepperEnv, 'mon')
+        }
+        if (STAGE_UPGRADE_RGW.toBoolean() == true) {
+            backup(pepperEnv, 'radosgw')
+        }
+    }
+
     if (flags.size() > 0) {
         stage('Set cluster flags') {
             for (flag in flags) {
@@ -86,42 +155,18 @@
 
     if (STAGE_UPGRADE_MON.toBoolean() == true) {
         upgrade(pepperEnv, 'mon')
-        stage("Verify mon services") {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph mon stat")
-        }
-        stage('Ask for manual confirmation') {
-            input message: "From the verification command above, please check Ceph mons joined the cluster. If so, Do you want to continue?"
-        }
     }
 
     if (STAGE_UPGRADE_MGR.toBoolean() == true) {
         upgrade(pepperEnv, 'mgr')
-        stage("Verify mgr services") {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph -s")
-        }
-        stage('Ask for manual confirmation') {
-            input message: "From the verification command above, please check Ceph mgr joined the cluster. If so, Do you want to continue?"
-        }
     }
 
     if (STAGE_UPGRADE_OSD.toBoolean() == true) {
         upgrade(pepperEnv, 'osd')
-        stage("Verify osd services") {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd stat")
-        }
-        stage('Ask for manual confirmation') {
-            input message: "From the verification command above, please check Ceph osds joined the cluster. If so, Do you want to continue?"
-        }
     }
 
     if (STAGE_UPGRADE_RGW.toBoolean() == true) {
         upgrade(pepperEnv, 'radosgw')
-        stage("Verify rgw services") {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph -s")
-        }
-        stage('Ask for manual confirmation') {
-            input message: "From the verification command above, please check Ceph rgw joined the cluster. If so, Do you want to continue?"
-        }
     }
 
     if (STAGE_UPGRADE_CLIENT.toBoolean() == true) {
@@ -141,27 +186,24 @@
         }
     }
 
-    stage("Finalize ceph version upgrade") {
-        runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd require-osd-release ${TARGET_RELEASE}")
-        try {
-            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd set-require-min-compat-client ${ORIGIN_RELEASE}")
-        } catch (Exception e) {
-            common.warningMsg(e)
+    if (STAGE_FINALIZE.toBoolean() == true) {
+        stage("Finalize ceph version upgrade") {
+            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd require-osd-release ${TARGET_RELEASE}")
+            try {
+                runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd set-require-min-compat-client ${ORIGIN_RELEASE}")
+            } catch (Exception e) {
+                common.warningMsg(e)
+            }
+            try {
+                runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd crush tunables optimal")
+            } catch (Exception e) {
+                common.warningMsg(e)
+            }
         }
-        runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd crush tunables optimal")
     }
 
     // wait for healthy cluster
     if (WAIT_FOR_HEALTHY.toBoolean() == true) {
-        stage('Waiting for healthy cluster') {
-            while (true) {
-                def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph -s')['return'][0].values()[0]
-                if (health.contains('HEALTH_OK')) {
-                    common.infoMsg('Cluster is healthy')
-                    break;
-                }
-                sleep(10)
-            }
-        }
+        waitForHealthy(pepperEnv)
     }
 }