Jiri Broulik | 2c00f4c | 2017-10-26 13:23:11 +0200 | [diff] [blame] | 1 | /** |
| 2 | * |
| 3 | * Replace failed disk with a new disk |
| 4 | * |
| 5 | * Required parameters: |
Jiri Broulik | a657d56 | 2017-11-28 14:19:32 +0100 | [diff] [blame] | 6 | * SALT_MASTER_URL URL of Salt master |
| 7 | * SALT_MASTER_CREDENTIALS Credentials to the Salt API |
Jiri Broulik | 2c00f4c | 2017-10-26 13:23:11 +0200 | [diff] [blame] | 8 | * |
Jiri Broulik | a657d56 | 2017-11-28 14:19:32 +0100 | [diff] [blame] | 9 | * HOST Host (minion id) on which the failed disks are replaced |
| 10 | * ADMIN_HOST Host (minion id) with admin keyring and /etc/crushmap file present |
| 11 | * OSD Failed OSD ids to be replaced (comma-separated list - 1,2,3) |
| 12 | * DEVICE Comma separated list of failed devices that will be replaced at HOST (/dev/sdb,/dev/sdc) |
| 13 | * JOURNAL_BLOCKDB_BLOCKWAL_PARTITION Comma separated list of partitions where journal or block_db or block_wal for the failed devices on this HOST were stored (/dev/sdh2,/dev/sdh3) |
Jiri Broulik | 86c50d0 | 2018-07-11 14:48:19 +0200 | [diff] [blame] | 14 | * DATA_PARTITION Comma separated list of mounted partitions of failed device. These partitions will be unmounted. For ex. /dev/sdb1,/dev/sdb3 |
Jiri Broulik | a657d56 | 2017-11-28 14:19:32 +0100 | [diff] [blame] | 15 | * CLUSTER_FLAGS Comma separated list of flags to set on the cluster (via `ceph osd set`) while the disks are replaced |
| 16 | * WAIT_FOR_HEALTHY Wait for cluster rebalance before stopping daemons |
Jiri Broulik | eb7b82f | 2017-11-30 13:55:40 +0100 | [diff] [blame] | 17 | * DMCRYPT Set to True if the OSDs being replaced are/were encrypted |
Jiri Broulik | 2c00f4c | 2017-10-26 13:23:11 +0200 | [diff] [blame] | 18 | * |
| 19 | */ |
| 20 | |
// Shared-library helpers. `common` and `salt` are assigned without `def`, which
// puts them in the script binding; the methods defined further down
// (runCephCommand, waitForHealthy) rely on that to reach them.
common = new com.mirantis.mk.Common()
salt = new com.mirantis.mk.Salt()
def python = new com.mirantis.mk.Python()

def pepperEnv = "pepperEnv"

// Job parameters are comma-separated lists. tokenize() treats its argument as a
// set of delimiter characters, so ', ' splits on commas AND spaces: a value such
// as "1, 2,3" yields ['1', '2', '3'] instead of ['1', ' 2', '3'] (which would
// later produce a broken name like 'osd. 2'). Empty tokens are dropped.
def flags = CLUSTER_FLAGS.tokenize(', ')
def osds = OSD.tokenize(', ')
def devices = DEVICE.tokenize(', ')
def journals_blockdbs_blockwals = JOURNAL_BLOCKDB_BLOCKWAL_PARTITION.tokenize(', ')
def mounted_partitions = DATA_PARTITION.tokenize(', ')
Jiri Broulik | 2c00f4c | 2017-10-26 13:23:11 +0200 | [diff] [blame] | 31 | |
| 32 | |
/**
 * Run a shell command on the given Salt target.
 *
 * @param master Salt connection handle forwarded to salt.cmdRun
 * @param target Salt target (minion id) on which the command is executed
 * @param cmd    command line to run
 * @return whatever salt.cmdRun returns for this call
 */
def runCephCommand(master, target, cmd) {
    def result = salt.cmdRun(master, target, cmd)
    return result
}
| 36 | |
/**
 * Poll `ceph health` on ADMIN_HOST until the output contains HEALTH_OK or the
 * attempt budget is used up.
 *
 * The function never fails the build: if the cluster does not become healthy
 * in time, a warning is logged and the caller continues.
 *
 * @param master   Salt connection handle passed to runCephCommand
 * @param count    initial value of the attempt counter (default 0)
 * @param attempts upper bound for the attempt counter (default 300); the
 *                 function sleeps 10 (Jenkins `sleep` step) between polls
 */
def waitForHealthy(master, count=0, attempts=300) {
    while (count < attempts) {
        def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
        if (health.contains('HEALTH_OK')) {
            common.infoMsg('Cluster is healthy')
            return
        }
        count++
        sleep(10)
    }
    // Previously the loop simply fell through here without any trace in the
    // log, which made an unhealthy cluster indistinguishable from a healthy one.
    common.warningMsg('Cluster did not report HEALTH_OK before the attempt limit (' + attempts + ') was reached, continuing anyway')
}
/**
 * Split a partition path into its parent device and partition number.
 *
 * Only the trailing digits are treated as the partition number, so device
 * names that themselves contain digits are handled:
 *   /dev/sdh2        -> ['/dev/sdh', '2']
 *   /dev/sdh12       -> ['/dev/sdh', '12']
 *   /dev/nvme0n1p2   -> ['/dev/nvme0n1', '2']
 *
 * Uses only String methods (no Matcher objects) so it is safe to call from
 * CPS-transformed pipeline code.
 *
 * @param partition partition path, surrounding whitespace is ignored
 * @return two-element list: [device path, partition number as string]
 */
def splitPartitionPath(String partition) {
    def path = partition.trim()
    // Greedy match up to the last non-digit character leaves the trailing digits.
    def partId = path.replaceFirst('^.*[^0-9]', '')
    def dev = path.substring(0, path.length() - partId.length())
    // When the device name ends in a digit the kernel inserts a 'p' between
    // the device name and the partition number (nvme0n1p2, mmcblk0p1, ...).
    if (partId && dev.matches('.*[0-9]p')) {
        dev = dev.substring(0, dev.length() - 1)
    }
    return [dev, partId]
}

/**
 * Remove the given partitions with `parted <dev> rm <id>` on the target host.
 * Blank entries are skipped; a failure on one partition is logged as a warning
 * and does not stop processing of the remaining ones.
 *
 * @param master     Salt connection handle passed to runCephCommand
 * @param target     Salt target (minion id) on which parted is run
 * @param partitions list of partition paths
 * @param cmdPrefix  text placed verbatim in front of the parted command
 */
def removePartitions(master, target, partitions, String cmdPrefix = '') {
    for (partition in partitions) {
        if (!partition?.trim()) {
            continue
        }
        def parts = splitPartitionPath(partition)
        try {
            runCephCommand(master, target, "${cmdPrefix}parted ${parts[0]} rm ${parts[1]}")
        } catch (Exception e) {
            common.warningMsg(e)
        }
    }
}

timeout(time: 12, unit: 'HOURS') {
    node("python") {

        // create connection to salt master
        python.setupPepperVirtualenv(pepperEnv, SALT_MASTER_URL, SALT_MASTER_CREDENTIALS)

        // Ceph entity names ('osd.<id>') of the OSDs that are being replaced
        def osd_ids = []

        for (osd_id in osds) {
            osd_ids.add('osd.' + osd_id)
            print("Will delete " + osd_id)
        }

        // `ceph osd out <id> <id>`
        stage('Set OSDs out') {
            runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd out ' + osd_ids.join(' '))
        }

        // wait for healthy cluster
        if (WAIT_FOR_HEALTHY.toBoolean()) {
            sleep(5)
            waitForHealthy(pepperEnv)
        }

        if (flags.size() > 0) {
            stage('Set cluster flags') {
                for (flag in flags) {
                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd set ' + flag)
                }
            }
        }

        // stop osd daemons; the systemd unit is named after the bare OSD id
        stage('Stop OSD daemons') {
            for (osd_id in osds) {
                salt.runSaltProcessStep(pepperEnv, HOST, 'service.stop', ['ceph-osd@' + osd_id], null, true)
            }
        }
        /*
        // `ceph osd crush remove osd.2`
        stage('Remove OSDs from CRUSH') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd crush remove ' + i)
            }
        }

        // wait for pgs to rebalance
        if (WAIT_FOR_PG_REBALANCE.toBoolean() == true) {
            stage('Waiting for pgs to rebalance') {
                while (true) {
                    def status = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph -s')['return'][0].values()[0]
                    if (!status.contains('degraded')) {
                        common.infoMsg('PGs rebalanced')
                        break;
                    }
                    sleep(10)
                }
            }
        }
        */
        // remove keyring `ceph auth del osd.3`
        stage('Remove OSD keyrings from auth') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph auth del ' + i)
            }
        }

        // remove osd `ceph osd rm osd.3`
        stage('Remove OSDs') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd rm ' + i)
            }
        }

        if (DMCRYPT.toBoolean()) {

            // remove partition tables
            stage('dd / zap device') {
                for (dev in devices) {
                    runCephCommand(pepperEnv, HOST, "dd if=/dev/zero of=${dev} bs=4096k count=1 conv=notrunc")
                    try {
                        runCephCommand(pepperEnv, HOST, "sgdisk --zap-all --clear --mbrtogpt -g -- ${dev}")
                    } catch (Exception e) {
                        common.warningMsg(e)
                    }
                }
            }

            // remove journal, block_db or block_wal partition `parted /dev/sdj rm 3`
            stage('Remove journal / block_db / block_wal partitions') {
                // NOTE(review): the 'Ignore | ' prefix is kept exactly as it was.
                // It presumably was meant to answer a parted prompt (e.g.
                // `echo Ignore | parted ...`) - confirm before changing it.
                removePartitions(pepperEnv, HOST, journals_blockdbs_blockwals, 'Ignore | ')
            }

            // reboot
            stage('reboot and wait') {
                salt.runSaltProcessStep(pepperEnv, HOST, 'system.reboot', null, null, true, 5)
                salt.minionsReachable(pepperEnv, 'I@salt:master', HOST)
                sleep(10)
            }

            // zap disks `ceph-disk zap /dev/sdi`
            stage('Zap devices') {
                for (dev in devices) {
                    // First attempt is best-effort (failure is only logged);
                    // the second attempt must succeed or the stage fails.
                    try {
                        runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                    } catch (Exception e) {
                        common.warningMsg(e)
                    }
                    runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                }
            }

        } else {

            // umount `umount /dev/sdi1`
            stage('Umount partitions') {
                if (mounted_partitions == null || mounted_partitions.empty) {
                    // No explicit list given: fall back to the first partition of each device
                    for (dev in devices) {
                        try {
                            runCephCommand(pepperEnv, HOST, 'umount ' + dev + '1')
                        } catch (Exception e) {
                            common.warningMsg(e)
                        }
                    }
                } else {
                    for (part in mounted_partitions) {
                        try {
                            runCephCommand(pepperEnv, HOST, 'umount ' + part)
                        } catch (Exception e) {
                            common.warningMsg(e)
                        }
                    }
                }
            }

            // zap disks `ceph-disk zap /dev/sdi`
            stage('Zap devices') {
                for (dev in devices) {
                    runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                }
            }

            // remove journal, block_db or block_wal partition `parted /dev/sdj rm 3`
            stage('Remove journal / block_db / block_wal partitions') {
                removePartitions(pepperEnv, HOST, journals_blockdbs_blockwals)
            }
        }

        // Deploy failed Ceph OSD
        stage('Deploy Ceph OSD') {
            salt.enforceState(pepperEnv, HOST, 'ceph.osd', true)
        }

        // remove cluster flags
        if (flags.size() > 0) {
            stage('Unset cluster flags') {
                for (flag in flags) {
                    common.infoMsg('Removing flag ' + flag)
                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd unset ' + flag)
                }
            }
        }

        /*
        if (ENFORCE_CRUSHMAP.toBoolean() == true) {

            // enforce crushmap `crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled; ceph osd setcrushmap -i /etc/ceph/crushmap.compiled`
            stage('Enforce crushmap') {

                stage('Ask for manual confirmation') {
                    input message: "Are you sure that your ADMIN_HOST has correct /etc/ceph/crushmap file? Click proceed to compile and enforce crushmap."
                }
                runCephCommand(pepperEnv, ADMIN_HOST, 'crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled')
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd setcrushmap -i /etc/ceph/crushmap.compiled')
            }
        }
        */
    }
}