blob: 601d74dce288d6684b6f320c5cf88dc1b222aea2 [file] [log] [blame]
/**
 *
 * Replace failed disk with a new disk
 *
 * Required parameters:
 *  SALT_MASTER_URL                     URL of Salt master
 *  SALT_MASTER_CREDENTIALS             Credentials to the Salt API
 *
 *  HOST                                Host (minion id) on which the failed disk is replaced
 *  ADMIN_HOST                          Host (minion id) with admin keyring and /etc/crushmap file present
 *  OSD                                 Failed OSD ids to be replaced (comma-separated list - 1,2,3)
 *  DEVICE                              Comma separated list of failed devices that will be replaced at HOST (/dev/sdb,/dev/sdc)
 *  JOURNAL_BLOCKDB_BLOCKWAL_PARTITION  Comma separated list of partitions where journal or block_db or block_wal for the failed devices on this HOST were stored (/dev/sdh2,/dev/sdh3)
 *  DATA_PARTITION                      Comma separated list of mounted partitions of the failed device. These partitions will be unmounted. For example /dev/sdb1,/dev/sdb3
 *  CLUSTER_FLAGS                       Comma separated list of flags to set on the cluster while the disk is being replaced
 *  WAIT_FOR_HEALTHY                    Wait for cluster rebalance before stopping daemons
 *  DMCRYPT                             Set to True if the OSDs being replaced are/were encrypted
 *
 */
20
// Shared-library helpers. `common` and `salt` are assigned without `def` on
// purpose: that puts them into the script binding, which is what makes them
// reachable from the methods declared further down in this file. `python` is
// only used from the top-level script body, so a local variable is enough.
common = new com.mirantis.mk.Common()
salt = new com.mirantis.mk.Salt()
def python = new com.mirantis.mk.Python()

// Identifier handed to python.setupPepperVirtualenv() and then passed as the
// "master" argument of every Salt call made by this pipeline.
def pepperEnv = "pepperEnv"

// Comma-separated job parameters turned into lists (tokenize drops empty items).
def osds = OSD.tokenize(',')
def devices = DEVICE.tokenize(',')
def mounted_partitions = DATA_PARTITION.tokenize(',')
def journals_blockdbs_blockwals = JOURNAL_BLOCKDB_BLOCKWAL_PARTITION.tokenize(',')
def flags = CLUSTER_FLAGS.tokenize(',')
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020031
32
/**
 * Run a shell command on the given Salt target.
 *
 * @param master  Salt connection / pepper environment passed through to salt.cmdRun
 * @param target  Salt target (minion id) the command is executed on
 * @param cmd     command line to execute
 * @return whatever salt.cmdRun returns for the call
 */
def runCephCommand(master, target, cmd) {
    def result = salt.cmdRun(master, target, cmd)
    return result
}
36
/**
 * Poll `ceph health` on ADMIN_HOST until the output contains HEALTH_OK.
 *
 * The check is repeated every 10 seconds while count < attempts. Waiting is
 * best-effort: when the limit is reached the method returns normally, but it
 * now logs a warning so that the timeout is visible in the job output instead
 * of being indistinguishable from a healthy cluster.
 *
 * @param master    Salt connection / pepper environment
 * @param count     starting value of the attempt counter (default 0)
 * @param attempts  upper bound for the attempt counter (default 300)
 */
def waitForHealthy(master, count=0, attempts=300) {
    while (count < attempts) {
        // first value of the first entry of the Salt 'return' list is the command output
        def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
        if (health.contains('HEALTH_OK')) {
            common.infoMsg('Cluster is healthy')
            return
        }
        count++
        sleep(10)
    }
    common.warningMsg("Cluster did not report HEALTH_OK before the attempt limit (${attempts}) was reached, continuing anyway")
}
/**
 * Split a partition path into its parent block device and partition number.
 *
 * Only the trailing digits are taken as the partition number, so device names
 * that contain digits themselves are handled correctly:
 *   /dev/sdh2       -> [dev: '/dev/sdh',     part_id: '2']
 *   /dev/sdh12      -> [dev: '/dev/sdh',     part_id: '12']
 *   /dev/nvme0n1p3  -> [dev: '/dev/nvme0n1', part_id: '3']
 *
 * @param partition  partition path, surrounding whitespace is ignored
 * @return map with keys `dev` and `part_id`; `part_id` is an empty string when
 *         the path does not end in a digit
 */
def splitPartitionPath(partition) {
    def path = partition.trim()
    def dev = path.replaceFirst('[0-9]+$', '')
    def part_id = path.substring(dev.length())
    // Devices whose name ends in a digit (nvme0n1, mmcblk0, ...) separate the
    // partition number with a 'p' which is not part of the device name.
    if (part_id && dev.matches('.*[0-9]p')) {
        dev = dev.substring(0, dev.length() - 1)
    }
    return [dev: dev, part_id: part_id]
}

// Main pipeline: take the failed OSDs out of the cluster, wipe the replaced
// devices and their journal / block_db / block_wal partitions on HOST, then
// redeploy the OSDs through the ceph.osd Salt state.
timeout(time: 12, unit: 'HOURS') {
    node("python") {

        // create connection to salt master
        python.setupPepperVirtualenv(pepperEnv, SALT_MASTER_URL, SALT_MASTER_CREDENTIALS)

        def osd_prefix = 'osd.'
        def osd_ids = []

        for (osd_id in osds) {
            // trim so that an "1, 2" style parameter does not produce "osd. 2"
            osd_ids.add(osd_prefix + osd_id.trim())
            print("Will delete " + osd_id)
        }

        // `ceph osd out <id> <id>`
        stage('Set OSDs out') {
            runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd out ' + osd_ids.join(' '))
        }

        // wait for healthy cluster
        if (WAIT_FOR_HEALTHY.toBoolean() == true) {
            sleep(5)
            waitForHealthy(pepperEnv)
        }

        if (flags.size() > 0) {
            stage('Set cluster flags') {
                for (flag in flags) {
                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd set ' + flag)
                }
            }
        }

        // stop osd daemons
        stage('Stop OSD daemons') {
            for (i in osd_ids) {
                // systemd unit is ceph-osd@<numeric id>, i.e. the id without the "osd." prefix
                salt.runSaltProcessStep(pepperEnv, HOST, 'service.stop', ['ceph-osd@' + i.substring(osd_prefix.length())], null, true)
            }
        }
        /*
        // `ceph osd crush remove osd.2`
        stage('Remove OSDs from CRUSH') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd crush remove ' + i)
            }
        }

        // wait for pgs to rebalance
        if (WAIT_FOR_PG_REBALANCE.toBoolean() == true) {
            stage('Waiting for pgs to rebalance') {
                while (true) {
                    def status = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph -s')['return'][0].values()[0]
                    if (!status.contains('degraded')) {
                        common.infoMsg('PGs rebalanced')
                        break;
                    }
                    sleep(10)
                }
            }
        }
        */
        // remove keyring `ceph auth del osd.3`
        stage('Remove OSD keyrings from auth') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph auth del ' + i)
            }
        }

        // remove osd `ceph osd rm osd.3`
        stage('Remove OSDs') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd rm ' + i)
            }
        }

        if (DMCRYPT.toBoolean() == true) {

            // remove partition tables
            stage('dd / zap device') {
                for (dev in devices) {
                    runCephCommand(pepperEnv, HOST, "dd if=/dev/zero of=${dev} bs=4096k count=1 conv=notrunc")
                    try {
                        runCephCommand(pepperEnv, HOST, "sgdisk --zap-all --clear --mbrtogpt -g -- ${dev}")
                    } catch (Exception e) {
                        common.warningMsg(e)
                    }
                }
            }

            // remove journal, block_db or block_wal partition `parted /dev/sdj rm 3`
            stage('Remove journal / block_db / block_wal partitions') {
                for (partition in journals_blockdbs_blockwals) {
                    if (partition?.trim()) {
                        // e.g. /dev/sdi2 -> dev = /dev/sdi, part_id = 2
                        def parsed = splitPartitionPath(partition)
                        if (!parsed.part_id) {
                            common.warningMsg("Cannot determine partition number of ${partition}, skipping")
                            continue
                        }
                        try {
                            // Feed "Ignore" to parted's stdin. The previous command line
                            // started with a bare `Ignore |`, which is not an executable.
                            // NOTE(review): confirm parted consumes the piped answer on the target hosts.
                            runCephCommand(pepperEnv, HOST, "echo Ignore | parted ${parsed.dev} rm ${parsed.part_id}")
                        } catch (Exception e) {
                            common.warningMsg(e)
                        }
                    }
                }
            }

            // reboot
            stage('reboot and wait') {
                salt.runSaltProcessStep(pepperEnv, HOST, 'system.reboot', null, null, true, 5)
                salt.minionsReachable(pepperEnv, 'I@salt:master', HOST)
                sleep(10)
            }

            // zap disks `ceph-disk zap /dev/sdi`
            stage('Zap devices') {
                for (dev in devices) {
                    // first attempt is best-effort, the second one has to succeed
                    try {
                        runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                    } catch (Exception e) {
                        common.warningMsg(e)
                    }
                    runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                }
            }

        } else {

            // umount `umount /dev/sdi1`
            stage('Umount partitions') {
                if (mounted_partitions == null || mounted_partitions.empty) {
                    // no explicit list given: fall back to the first partition of every device
                    for (dev in devices) {
                        try {
                            runCephCommand(pepperEnv, HOST, 'umount ' + dev + '1')
                        } catch (Exception e) {
                            common.warningMsg(e)
                        }
                    }
                } else {
                    for (part in mounted_partitions) {
                        try {
                            runCephCommand(pepperEnv, HOST, 'umount ' + part)
                        } catch (Exception e) {
                            common.warningMsg(e)
                        }
                    }
                }
            }

            // zap disks `ceph-disk zap /dev/sdi`
            stage('Zap devices') {
                for (dev in devices) {
                    runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                }
            }

            // remove journal, block_db or block_wal partition `parted /dev/sdj rm 3`
            stage('Remove journal / block_db / block_wal partitions') {
                for (partition in journals_blockdbs_blockwals) {
                    if (partition?.trim()) {
                        // e.g. /dev/sdi2 -> dev = /dev/sdi, part_id = 2
                        def parsed = splitPartitionPath(partition)
                        if (!parsed.part_id) {
                            common.warningMsg("Cannot determine partition number of ${partition}, skipping")
                            continue
                        }
                        try {
                            runCephCommand(pepperEnv, HOST, "parted ${parsed.dev} rm ${parsed.part_id}")
                        } catch (Exception e) {
                            common.warningMsg(e)
                        }
                    }
                }
            }
        }

        // Deploy failed Ceph OSD
        stage('Deploy Ceph OSD') {
            salt.enforceState(pepperEnv, HOST, 'ceph.osd', true)
        }

        // remove cluster flags
        if (flags.size() > 0) {
            stage('Unset cluster flags') {
                for (flag in flags) {
                    common.infoMsg('Removing flag ' + flag)
                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd unset ' + flag)
                }
            }
        }

        /*
        if (ENFORCE_CRUSHMAP.toBoolean() == true) {

            // enforce crushmap `crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled; ceph osd setcrushmap -i /etc/ceph/crushmap.compiled`
            stage('Enforce crushmap') {

                stage('Ask for manual confirmation') {
                    input message: "Are you sure that your ADMIN_HOST has correct /etc/ceph/crushmap file? Click proceed to compile and enforce crushmap."
                }
                runCephCommand(pepperEnv, ADMIN_HOST, 'crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled')
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd setcrushmap -i /etc/ceph/crushmap.compiled')
            }
        }
        */
    }
}