Add Ceph upgrade and backend migration pipelines

Change-Id: If9d8203bae08c82238e596dd7b746daaf1b94fb9
diff --git a/ceph-add-node.groovy b/ceph-add-node.groovy
index 8757ca1..9da7110 100644
--- a/ceph-add-node.groovy
+++ b/ceph-add-node.groovy
@@ -37,7 +37,7 @@
 
     if (HOST_TYPE.toLowerCase() != 'osd') {
 
-        // enforce crushmap `crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled; ceph osd setcrushmap -i /etc/ceph/crushmap.compiled`
+        // launch VMs
         stage('Launch VMs') {
             salt.enforceState(pepperEnv, 'I@salt:control', 'salt.control', true)
 
diff --git a/ceph-backend-migration.groovy b/ceph-backend-migration.groovy
new file mode 100644
index 0000000..0e9b6b7
--- /dev/null
+++ b/ceph-backend-migration.groovy
@@ -0,0 +1,198 @@
+/**
+ *
+ * Filestore to Bluestore or vice versa backend migration
+ *
+ *  Required parameters:
+ *  SALT_MASTER_URL                 URL of Salt master
+ *  SALT_MASTER_CREDENTIALS         Credentials to the Salt API
+ *
+ *  ADMIN_HOST                      Host (minion id) with admin keyring
+ *  OSD                             OSD ids to be migrated if a single OSD host is targeted (comma-separated list - 1,2,3, or * for all)
+ *  TARGET                          Hosts (minion ids) to be targeted
+ *  CLUSTER_FLAGS                   Comma separated list of flags to set on the cluster (e.g. noout)
+ *  WAIT_FOR_HEALTHY                Wait for the cluster to become healthy before stopping daemons
+ *  ORIGIN_BACKEND                  Ceph backend to migrate from (filestore or bluestore)
+ *
+ */
+
+common = new com.mirantis.mk.Common()
+salt = new com.mirantis.mk.Salt()
+def python = new com.mirantis.mk.Python()
+
+MIGRATION_METHOD = "per-osd"
+// TBD: per-host
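+// With the per-osd method each selected OSD is migrated one at a time: it is
+// marked out, its daemon is stopped, the OSD is removed from the cluster, its
+// journal/block.db partition and data disk are wiped, and the OSD is redeployed
+// via the ceph.osd Salt state, which is expected to recreate it with the
+// backend defined in the pillar.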
+
+def pepperEnv = "pepperEnv"
+def flags = CLUSTER_FLAGS.tokenize(',')
+def osds = OSD.tokenize(',')
+
+def runCephCommand(master, target, cmd) {
+    return salt.cmdRun(master, target, cmd)
+}
+
+node("python") {
+
+    // create connection to salt master
+    python.setupPepperVirtualenv(pepperEnv, SALT_MASTER_URL, SALT_MASTER_CREDENTIALS)
+
+    if (MIGRATION_METHOD == 'per-osd') {
+
+        if (flags.size() > 0) {
+            stage('Set cluster flags') {
+                for (flag in flags) {
+                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd set ' + flag)
+                }
+            }
+        }
+
+        def target_hosts = salt.getMinions(pepperEnv, TARGET)
+
+        for (HOST in target_hosts) {
+            def osd_ids = []
+
+            // get list of osd disks of the host
+            def ceph_disks = salt.getGrain(pepperEnv, HOST, 'ceph')['return'][0].values()[0].values()[0]['ceph_disk']
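+            // the ceph:ceph_disk grain maps OSD ids to their disk metadata,
+            // so its keys are the OSD ids currently deployed on this host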
+
+            for (i in ceph_disks) {
+                def osd_id = i.getKey().toString()
+                if (osd_id in osds || OSD == '*') {
+                    osd_ids.add('osd.' + osd_id)
+                    print("Will migrate " + osd_id)
+                } else {
+                    print("Skipping " + osd_id)
+                }
+            }
+
+            for (osd_id in osd_ids) {
+
+                def id = osd_id.replaceAll('osd.', '')
+                def backend = runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd metadata ${id} | grep osd_objectstore")['return'][0].values()[0]
+
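+                // 'ceph osd metadata <id>' includes the osd_objectstore key
+                // (filestore or bluestore); only OSDs still running ORIGIN_BACKEND
+                // are migrated, the rest are left untouched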
+                if (backend.contains(ORIGIN_BACKEND)) {
+
+                    // wait for healthy cluster before manipulating with osds
+                    if (WAIT_FOR_HEALTHY.toBoolean() == true) {
+                        stage('Waiting for healthy cluster') {
+                            while (true) {
+                                def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
+                                if (health.contains('HEALTH_OK')) {
+                                    common.infoMsg('Cluster is healthy')
+                                    break;
+                                }
+                                sleep(5)
+                            }
+                        }
+                    }
+
+                    // `ceph osd out <id> <id>`
+                    stage('Set OSDs out') {
+                        runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd out ${osd_id}")
+                    }
+
+                    // wait for healthy cluster
+                    if (WAIT_FOR_HEALTHY.toBoolean() == true) {
+                        stage('Waiting for healthy cluster') {
+                            sleep(5)
+                            while (true) {
+                                def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
+                                if (health.contains('HEALTH_OK')) {
+                                    common.infoMsg('Cluster is healthy')
+                                    break;
+                                }
+                                sleep(10)
+                            }
+                        }
+                    }
+
+                    // stop osd daemons
+                    stage('Stop OSD daemons') {
+                        salt.runSaltProcessStep(pepperEnv, HOST, 'service.stop', ['ceph-osd@' + osd_id.replaceAll('osd.', '')],  null, true)
+                    }
+
+                    // remove keyring `ceph auth del osd.3`
+                    stage('Remove OSD keyrings from auth') {
+                        runCephCommand(pepperEnv, ADMIN_HOST, 'ceph auth del ' + osd_id)
+                    }
+
+                    // remove osd `ceph osd rm osd.3`
+                    stage('Remove OSDs') {
+                        runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd rm ' + osd_id)
+                    }
+
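+                    // find the mounted data partition of this OSD and derive its parent
+                    // device by stripping the partition number (e.g. /dev/sdi1 -> /dev/sdi);
+                    // note this assumes sdX-style device names, not NVMe (/dev/nvme0n1p1)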
+                    def mount = runCephCommand(pepperEnv, HOST, "mount | grep /var/lib/ceph/osd/ceph-${id}")['return'][0].values()[0]
+                    dev = mount.split()[0].replaceAll("[0-9]","")
+
+                    // remove journal or block_db partition `parted /dev/sdj rm 3`
+                    stage('Remove journal / block_db partition') {
+                        def partition_uuid = ""
+                        def journal_partition_uuid = ""
+                        def block_db_partition_uuid = ""
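+                        // OSDs created by ceph-disk usually expose either a 'journal'
+                        // (filestore) or a 'block.db' (bluestore) symlink under
+                        // /var/lib/ceph/osd/ceph-<id>/ pointing to /dev/disk/by-partuuid/<uuid>;
+                        // whichever is found gives the partuuid of the partition to remove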
+                        try {
+                            journal_partition_uuid = runCephCommand(pepperEnv, HOST, "ls -la /var/lib/ceph/osd/ceph-${id}/ | grep journal | grep partuuid")
+                            journal_partition_uuid = journal_partition_uuid.toString().trim().split("\n")[0].substring(journal_partition_uuid.toString().trim().lastIndexOf("/")+1)
+                        } catch (Exception e) {
+                            common.infoMsg(e)
+                        }
+                        try {
+                            block_db_partition_uuid = runCephCommand(pepperEnv, HOST, "ls -la /var/lib/ceph/osd/ceph-${id}/ | grep 'block.db' | grep partuuid")
+                            block_db_partition_uuid = block_db_partition_uuid.toString().trim().split("\n")[0].substring(block_db_partition_uuid.toString().trim().lastIndexOf("/")+1)
+                        } catch (Exception e) {
+                            common.infoMsg(e)
+                        }
+
+                        // set partition_uuid = 2c76f144-f412-481e-b150-4046212ca932
+                        if (journal_partition_uuid?.trim()) {
+                            partition_uuid = journal_partition_uuid
+                        } else if (block_db_partition_uuid?.trim()) {
+                            partition_uuid = block_db_partition_uuid
+                        }
+
+                        // if failed disk had block_db or journal on different disk, then remove the partition
+                        if (partition_uuid?.trim()) {
+                            def partition = ""
+                            try {
+                                // partition = /dev/sdi2
+                                partition = runCephCommand(pepperEnv, HOST, "blkid | grep ${partition_uuid} ")['return'][0].values()[0].split("(?<=[0-9])")[0]
+                            } catch (Exception e) {
+                                common.warningMsg(e)
+                            }
+
+                            if (partition?.trim()) {
+                                // dev = /dev/sdi
+                                def dev = partition.replaceAll("[0-9]", "")
+                                // part_id = 2
+                                def part_id = partition.substring(partition.lastIndexOf("/")+1).replaceAll("[^0-9]", "")
+                                runCephCommand(pepperEnv, HOST, "parted ${dev} rm ${part_id}")
+                            }
+                        }
+                    }
+
+                    // umount `umount /dev/sdi1`
+                    stage('Umount devices') {
+                        runCephCommand(pepperEnv, HOST, "umount /var/lib/ceph/osd/ceph-${id}")
+                    }
+
+                    // zap disks `ceph-disk zap /dev/sdi`
+                    stage('Zap device') {
+                        runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
+                    }
+
+                    // redeploy the OSD via the ceph.osd state so it is recreated with the new backend
+                    stage('Deploy Ceph OSD') {
+                        salt.runSaltProcessStep(pepperEnv, HOST, 'saltutil.refresh_pillar', [], null, true, 5)
+                        salt.enforceState(pepperEnv, HOST, 'ceph.osd', true)
+                    }
+                }
+            }
+        }
+        // remove cluster flags
+        if (flags.size() > 0) {
+            stage('Unset cluster flags') {
+                for (flag in flags) {
+                    common.infoMsg('Removing flag ' + flag)
+                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd unset ' + flag)
+                }
+            }
+        }
+    }
+}
diff --git a/ceph-remove-node.groovy b/ceph-remove-node.groovy
index 21671bf..bc0b09d 100644
--- a/ceph-remove-node.groovy
+++ b/ceph-remove-node.groovy
@@ -92,6 +92,7 @@
         // wait for healthy cluster
         if (WAIT_FOR_HEALTHY.toBoolean() == true) {
             stage('Waiting for healthy cluster') {
+                sleep(5)
                 while (true) {
                     def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
                     if (health.contains('HEALTH_OK')) {
diff --git a/ceph-remove-osd.groovy b/ceph-remove-osd.groovy
index 18c5525..c5610eb 100644
--- a/ceph-remove-osd.groovy
+++ b/ceph-remove-osd.groovy
@@ -60,12 +60,13 @@
 
     // `ceph osd out <id> <id>`
     stage('Set OSDs out') {
-            runCephCommand(pepperEnv, 'ceph osd out ' + osd_ids.join(' '))
+        runCephCommand(pepperEnv, 'ceph osd out ' + osd_ids.join(' '))
     }
 
     // wait for healthy cluster
     if (WAIT_FOR_HEALTHY.toBoolean() == true) {
         stage('Waiting for healthy cluster') {
+            sleep(5)
             while (true) {
                 def health = runCephCommand(pepperEnv, 'ceph health')['return'][0].values()[0]
                 if (health.contains('HEALTH_OK')) {
diff --git a/ceph-replace-failed-osd.groovy b/ceph-replace-failed-osd.groovy
index 6c6f281..ee4ef38 100644
--- a/ceph-replace-failed-osd.groovy
+++ b/ceph-replace-failed-osd.groovy
@@ -11,8 +11,6 @@
  *  OSD                             Failed OSD ids to be replaced (comma-separated list - 1,2,3)
  *  DEVICE                          Comma separated list of failed devices that will be replaced at HOST (/dev/sdb,/dev/sdc)
  *  JOURNAL_OR_BLOCKDB_PARTITION    Comma separated list of partitions where journal or block_db for the failed devices on this HOST were stored (/dev/sdh2,/dev/sdh3)
- *  ENFORCE_CRUSHMAP                Set to true if the prepared crush map should be enforced
- *  WAIT_FOR_PG_REBALANCE           Wait for PGs to rebalance after osd is removed from crush map
  *  CLUSTER_FLAGS                   Comma separated list of tags to apply to cluster
  *  WAIT_FOR_HEALTHY                Wait for cluster rebalance before stoping daemons
  *
@@ -67,12 +65,13 @@
 
     // `ceph osd out <id> <id>`
     stage('Set OSDs out') {
-            runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd out ' + osd_ids.join(' '))
+        runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd out ' + osd_ids.join(' '))
     }
 
     // wait for healthy cluster
     if (WAIT_FOR_HEALTHY.toBoolean() == true) {
         stage('Waiting for healthy cluster') {
+            sleep(5)
             while (true) {
                 def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
                 if (health.contains('HEALTH_OK')) {
@@ -90,7 +89,7 @@
             salt.runSaltProcessStep(pepperEnv, HOST, 'service.stop', ['ceph-osd@' + i.replaceAll('osd.', '')],  null, true)
         }
     }
-
+    /*
     // `ceph osd crush remove osd.2`
     stage('Remove OSDs from CRUSH') {
         for (i in osd_ids) {
@@ -111,7 +110,7 @@
             }
         }
     }
-
+    */
     // remove keyring `ceph auth del osd.3`
     stage('Remove OSD keyrings from auth') {
         for (i in osd_ids) {
@@ -126,16 +125,6 @@
         }
     }
 
-    // remove cluster flags
-    if (flags.size() > 0) {
-        stage('Unset cluster flags') {
-            for (flag in flags) {
-                common.infoMsg('Removing flag ' + flag)
-                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd unset ' + flag)
-            }
-        }
-    }
-
     // umount `umount /dev/sdi1`
     stage('Umount devices') {
         for (dev in devices) {
@@ -168,7 +157,17 @@
         salt.enforceState(pepperEnv, HOST, 'ceph.osd', true)
     }
 
+    // remove cluster flags
+    if (flags.size() > 0) {
+        stage('Unset cluster flags') {
+            for (flag in flags) {
+                common.infoMsg('Removing flag ' + flag)
+                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd unset ' + flag)
+            }
+        }
+    }
 
+    /*
     if (ENFORCE_CRUSHMAP.toBoolean() == true) {
 
         // enforce crushmap `crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled; ceph osd setcrushmap -i /etc/ceph/crushmap.compiled`
@@ -181,4 +180,5 @@
             runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd setcrushmap -i /etc/ceph/crushmap.compiled')
         }
     }
+    */
 }
diff --git a/ceph-upgrade.groovy b/ceph-upgrade.groovy
new file mode 100644
index 0000000..db6bea3
--- /dev/null
+++ b/ceph-upgrade.groovy
@@ -0,0 +1,167 @@
+/**
+ *
+ * Upgrade Ceph mon/mgr/osd/rgw/client
+ *
+ *  Required parameters:
+ *  SALT_MASTER_URL                 URL of Salt master
+ *  SALT_MASTER_CREDENTIALS         Credentials to the Salt API
+ *
+ *  ADMIN_HOST                      Host (minion id) with admin keyring
+ *  CLUSTER_FLAGS                   Comma separated list of flags to set on the cluster (e.g. noout)
+ *  WAIT_FOR_HEALTHY                Wait for the cluster to become healthy at the end of the upgrade
+ *  ORIGIN_RELEASE                  Ceph release version before upgrade
+ *  TARGET_RELEASE                  Ceph release version after upgrade
+ *  STAGE_UPGRADE_MON               Set to True to upgrade Ceph mon nodes
+ *  STAGE_UPGRADE_MGR               Set to True to upgrade Ceph mgr nodes or deploy them for the first time
+ *  STAGE_UPGRADE_OSD               Set to True to upgrade Ceph osd nodes
+ *  STAGE_UPGRADE_RGW               Set to True to upgrade Ceph rgw nodes
+ *  STAGE_UPGRADE_CLIENT            Set to True to upgrade Ceph client nodes (for example ctl/cmp nodes)
+ *
+ */
+
+common = new com.mirantis.mk.Common()
+salt = new com.mirantis.mk.Salt()
+def python = new com.mirantis.mk.Python()
+
+def pepperEnv = "pepperEnv"
+def flags = CLUSTER_FLAGS.tokenize(',')
+
+def runCephCommand(master, target, cmd) {
+    return salt.cmdRun(master, target, cmd)
+}
+
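+// upgrade() switches the package repositories for the given ceph:<target> minions,
+// installs the new packages and (except for the 'common' client target) restarts
+// the matching ceph-<target>.target systemd unit; for the 'mgr' target the ceph.mgr
+// state is applied as well, so mgr daemons are deployed if they do not exist yet.
+// 'ceph versions' is printed afterwards so the progress of the upgrade can be followed.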
+def upgrade(master, target) {
+
+    stage("Change ${target} repos") {
+        salt.runSaltProcessStep(master, "I@ceph:${target}", 'saltutil.refresh_pillar', [], null, true, 5)
+        salt.enforceState(master, "I@ceph:${target}", 'linux.system.repo', true)
+    }
+
+    if (target == 'mgr') {
+        stage('Run ceph mgr state') {
+            salt.enforceState(master, "I@ceph:mgr", "ceph.mgr", true)
+        }
+    }
+
+    if (target == 'common') {
+        stage('Upgrade ceph-common pkgs') {
+            runCephCommand(master, "I@ceph:${target}", "apt install ceph-${target} -y ")
+        }
+    } else if (target == 'radosgw') {
+        stage('Upgrade radosgw pkgs') {
+            runCephCommand(master, "I@ceph:${target}", "apt install ${target} -y ")
+        }
+        // restart services
+        stage("Restart ${target} services") {
+            runCephCommand(master, "I@ceph:${target}", "systemctl restart ceph-${target}.target")
+        }
+    } else {
+
+        // upgrade pkgs
+        stage("Upgrade ${target} pkgs") {
+            runCephCommand(master, "I@ceph:${target}", "apt install ceph-${target} -y ")
+        }
+        // restart services
+        stage("Restart ${target} services") {
+            runCephCommand(master, "I@ceph:${target}", "systemctl restart ceph-${target}.target")
+        }
+    }
+    runCephCommand(master, ADMIN_HOST, "ceph versions")
+    sleep(5)
+    return
+}
+
+node("python") {
+
+    // create connection to salt master
+    python.setupPepperVirtualenv(pepperEnv, SALT_MASTER_URL, SALT_MASTER_CREDENTIALS)
+
+    if (flags.size() > 0) {
+        stage('Set cluster flags') {
+            for (flag in flags) {
+                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd set ' + flag)
+            }
+        }
+    }
+
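+    // components are upgraded in the order recommended for Ceph: mons first, then
+    // mgrs, OSDs, radosgw and finally the clients, with a manual confirmation
+    // gate after each component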
+    if (STAGE_UPGRADE_MON.toBoolean() == true) {
+        upgrade(pepperEnv, 'mon')
+        stage("Verify mon services") {
+            runCephCommand(pepperEnv, ADMIN_HOST, "ceph mon stat")
+        }
+        stage('Ask for manual confirmation') {
+            input message: "Please check the output of the verification command above and confirm that the Ceph mons have joined the cluster. If so, do you want to continue?"
+        }
+    }
+
+    if (STAGE_UPGRADE_MGR.toBoolean() == true) {
+        upgrade(pepperEnv, 'mgr')
+        stage("Verify mgr services") {
+            runCephCommand(pepperEnv, ADMIN_HOST, "ceph -s")
+        }
+        stage('Ask for manual confirmation') {
+            input message: "Please check the output of the verification command above and confirm that the Ceph mgr daemons have joined the cluster. If so, do you want to continue?"
+        }
+    }
+
+    if (STAGE_UPGRADE_OSD.toBoolean() == true) {
+        upgrade(pepperEnv, 'osd')
+        stage("Verify osd services") {
+            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd stat")
+        }
+        stage('Ask for manual confirmation') {
+            input message: "Please check the output of the verification command above and confirm that the Ceph osds have joined the cluster. If so, do you want to continue?"
+        }
+    }
+
+    if (STAGE_UPGRADE_RGW.toBoolean() == true) {
+        upgrade(pepperEnv, 'radosgw')
+        stage("Verify rgw services") {
+            runCephCommand(pepperEnv, ADMIN_HOST, "ceph -s")
+        }
+        stage('Ask for manual confirmation') {
+            input message: "Please check the output of the verification command above and confirm that the Ceph rgw daemons have joined the cluster. If so, do you want to continue?"
+        }
+    }
+
+    if (STAGE_UPGRADE_CLIENT.toBoolean() == true) {
+        upgrade(pepperEnv, 'common')
+    }
+
+    // remove cluster flags
+    if (flags.size() > 0) {
+        stage('Unset cluster flags') {
+            for (flag in flags) {
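+                // the sortbitwise flag is deliberately left set - releases newer
+                // than Jewel expect it to stay enabled, so it is not unset here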
+                if (!flag.contains('sortbitwise')) {
+                    common.infoMsg('Removing flag ' + flag)
+                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd unset ' + flag)
+                }
+
+            }
+        }
+    }
+
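+    // require-osd-release prevents OSDs older than TARGET_RELEASE from joining the
+    // cluster again, set-require-min-compat-client pins the minimum client release,
+    // and switching the crush tunables to 'optimal' may trigger a significant
+    // amount of data movement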
+    stage("Finalize ceph version upgrade") {
+        runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd require-osd-release ${TARGET_RELEASE}")
+        try {
+            runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd set-require-min-compat-client ${ORIGIN_RELEASE}")
+        } catch (Exception e) {
+            common.warningMsg(e)
+        }
+        runCephCommand(pepperEnv, ADMIN_HOST, "ceph osd crush tunables optimal")
+    }
+
+    // wait for healthy cluster
+    if (WAIT_FOR_HEALTHY.toBoolean() == true) {
+        stage('Waiting for healthy cluster') {
+            while (true) {
+                def health = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph -s')['return'][0].values()[0]
+                if (health.contains('HEALTH_OK')) {
+                    common.infoMsg('Cluster is healthy')
+                    break;
+                }
+                sleep(10)
+            }
+        }
+    }
+}