blob: 2361098a98a655f88134d063905a172b2dfc279e [file] [log] [blame]
/**
 *
 * Replace failed disk with a new disk
 *
 * Required parameters:
 *  SALT_MASTER_URL                     URL of Salt master
 *  SALT_MASTER_CREDENTIALS             Credentials to the Salt API
 *
 *  HOST                                Host (minion id) to be removed
 *  ADMIN_HOST                          Host (minion id) with admin keyring and /etc/crushmap file present
 *  OSD                                 Failed OSD ids to be replaced (comma-separated list - 1,2,3)
 *  DEVICE                              Comma separated list of failed devices that will be replaced at HOST (/dev/sdb,/dev/sdc)
 *  JOURNAL_BLOCKDB_BLOCKWAL_PARTITION  Comma separated list of partitions where journal or block_db or block_wal for the failed devices on this HOST were stored (/dev/sdh2,/dev/sdh3)
 *  CLUSTER_FLAGS                       Comma separated list of flags to apply to cluster
 *  WAIT_FOR_HEALTHY                    Wait for cluster rebalance before stopping daemons
 *  DMCRYPT                             Set to True if replacing osds are/were encrypted
 *
 */
19
20common = new com.mirantis.mk.Common()
21salt = new com.mirantis.mk.Salt()
22def python = new com.mirantis.mk.Python()
23
24def pepperEnv = "pepperEnv"
25def flags = CLUSTER_FLAGS.tokenize(',')
26def osds = OSD.tokenize(',')
27def devices = DEVICE.tokenize(',')
Jiri Broulika657d562017-11-28 14:19:32 +010028def journals_blockdbs_blockwals = JOURNAL_BLOCKDB_BLOCKWAL_PARTITION.tokenize(',')
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020029
30
31def runCephCommand(master, target, cmd) {
32 return salt.cmdRun(master, target, cmd)
33}
34
Jiri Broulik96c867a2017-11-07 16:14:10 +010035def waitForHealthy(master, count=0, attempts=300) {
36 // wait for healthy cluster
37 while (count<attempts) {
38 def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
39 if (health.contains('HEALTH_OK')) {
40 common.infoMsg('Cluster is healthy')
41 break;
42 }
43 count++
44 sleep(10)
45 }
46}
Jakub Josefa63f9862018-01-11 17:58:38 +010047timeout(time: 12, unit: 'HOURS') {
48 node("python") {
Jiri Broulik96c867a2017-11-07 16:14:10 +010049
Jakub Josefa63f9862018-01-11 17:58:38 +010050 // create connection to salt master
51 python.setupPepperVirtualenv(pepperEnv, SALT_MASTER_URL, SALT_MASTER_CREDENTIALS)
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020052
Jakub Josefa63f9862018-01-11 17:58:38 +010053 def osd_ids = []
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020054
Jakub Josefa63f9862018-01-11 17:58:38 +010055 for (osd_id in osds) {
56 osd_ids.add('osd.' + osd_id)
57 print("Will delete " + osd_id)
58 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020059
Jakub Josefa63f9862018-01-11 17:58:38 +010060 // `ceph osd out <id> <id>`
61 stage('Set OSDs out') {
62 runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd out ' + osd_ids.join(' '))
63 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020064
Jakub Josefa63f9862018-01-11 17:58:38 +010065 // wait for healthy cluster
66 if (WAIT_FOR_HEALTHY.toBoolean() == true) {
67 sleep(5)
68 waitForHealthy(pepperEnv)
69 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020070
Jiri Broulikeb7b82f2017-11-30 13:55:40 +010071
Jakub Josefa63f9862018-01-11 17:58:38 +010072 if (flags.size() > 0) {
73 stage('Set cluster flags') {
74 for (flag in flags) {
75 runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd set ' + flag)
76 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +010077 }
78 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +010079
Jakub Josefa63f9862018-01-11 17:58:38 +010080 // stop osd daemons
81 stage('Stop OSD daemons') {
82 for (i in osd_ids) {
83 salt.runSaltProcessStep(pepperEnv, HOST, 'service.stop', ['ceph-osd@' + i.replaceAll('osd.', '')], null, true)
84 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020085 }
Jakub Josefa63f9862018-01-11 17:58:38 +010086 /*
87 // `ceph osd crush remove osd.2`
88 stage('Remove OSDs from CRUSH') {
89 for (i in osd_ids) {
90 runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd crush remove ' + i)
91 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020092 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020093
Jakub Josefa63f9862018-01-11 17:58:38 +010094 // wait for pgs to rebalance
95 if (WAIT_FOR_PG_REBALANCE.toBoolean() == true) {
96 stage('Waiting for pgs to rebalance') {
97 while (true) {
98 def status = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph -s')['return'][0].values()[0]
99 if (!status.contains('degraded')) {
100 common.infoMsg('PGs rebalanced')
101 break;
102 }
103 sleep(10)
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200104 }
Jakub Josefa63f9862018-01-11 17:58:38 +0100105 }
106 }
107 */
108 // remove keyring `ceph auth del osd.3`
109 stage('Remove OSD keyrings from auth') {
110 for (i in osd_ids) {
111 runCephCommand(pepperEnv, ADMIN_HOST, 'ceph auth del ' + i)
112 }
113 }
114
115 // remove osd `ceph osd rm osd.3`
116 stage('Remove OSDs') {
117 for (i in osd_ids) {
118 runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd rm ' + i)
119 }
120 }
121
122 if (DMCRYPT.toBoolean() == true) {
123
124 // remove partition tables
Jiri Broulika5bc8f62018-01-31 15:04:40 +0100125 stage('dd / zap device') {
Jakub Josefa63f9862018-01-11 17:58:38 +0100126 for (dev in devices) {
Jiri Broulika5bc8f62018-01-31 15:04:40 +0100127 runCephCommand(pepperEnv, HOST, "dd if=/dev/zero of=${dev} bs=4096k count=1 conv=notrunc")
128 try {
129 runCephCommand(pepperEnv, HOST, "sgdisk --zap-all --clear --mbrtogpt -g -- ${dev}")
130 } catch (Exception e) {
131 common.warningMsg(e)
132 }
Jakub Josefa63f9862018-01-11 17:58:38 +0100133 }
134 }
135
136 // remove journal, block_db or block_wal partition `parted /dev/sdj rm 3`
137 stage('Remove journal / block_db / block_wal partitions') {
138 for (partition in journals_blockdbs_blockwals) {
139 if (partition?.trim()) {
140 // dev = /dev/sdi
141 def dev = partition.replaceAll("[0-9]", "")
142 // part_id = 2
Jiri Broulika5bc8f62018-01-31 15:04:40 +0100143 def part_id = partition.substring(partition.lastIndexOf("/")+1).replaceAll("[^0-9]+", "")
Jakub Josefa63f9862018-01-11 17:58:38 +0100144 try {
145 runCephCommand(pepperEnv, HOST, "Ignore | parted ${dev} rm ${part_id}")
146 } catch (Exception e) {
147 common.warningMsg(e)
148 }
149 }
150 }
151 }
152
153 // reboot
154 stage('reboot and wait') {
155 salt.runSaltProcessStep(pepperEnv, HOST, 'system.reboot', null, null, true, 5)
156 salt.minionsReachable(pepperEnv, 'I@salt:master', HOST)
Jiri Broulik99887c82017-10-31 09:27:52 +0100157 sleep(10)
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200158 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200159
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200160
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200161
Jakub Josefa63f9862018-01-11 17:58:38 +0100162 // zap disks `ceph-disk zap /dev/sdi`
163 stage('Zap devices') {
164 for (dev in devices) {
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100165 try {
Jakub Josefa63f9862018-01-11 17:58:38 +0100166 runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100167 } catch (Exception e) {
168 common.warningMsg(e)
169 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100170 runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100171 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100172 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100173
Jakub Josefa63f9862018-01-11 17:58:38 +0100174 } else {
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100175
Jakub Josefa63f9862018-01-11 17:58:38 +0100176 // umount `umount /dev/sdi1`
177 stage('Umount devices') {
178 for (dev in devices) {
179 runCephCommand(pepperEnv, HOST, 'umount ' + dev + '1')
180 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100181 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100182
Jakub Josefa63f9862018-01-11 17:58:38 +0100183 // zap disks `ceph-disk zap /dev/sdi`
184 stage('Zap devices') {
185 for (dev in devices) {
186 runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
187 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100188 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100189
Jakub Josefa63f9862018-01-11 17:58:38 +0100190 // remove journal, block_db or block_wal partition `parted /dev/sdj rm 3`
191 stage('Remove journal / block_db / block_wal partitions') {
192 for (partition in journals_blockdbs_blockwals) {
193 if (partition?.trim()) {
194 // dev = /dev/sdi
195 def dev = partition.replaceAll("[0-9]", "")
196 // part_id = 2
197 def part_id = partition.substring(partition.lastIndexOf("/")+1).replaceAll("[^0-9]", "")
198 try {
199 runCephCommand(pepperEnv, HOST, "parted ${dev} rm ${part_id}")
200 } catch (Exception e) {
201 common.warningMsg(e)
202 }
Jiri Broulikeb7b82f2017-11-30 13:55:40 +0100203 }
204 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200205 }
206 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200207
Jakub Josefa63f9862018-01-11 17:58:38 +0100208 // Deploy failed Ceph OSD
209 stage('Deploy Ceph OSD') {
210 salt.enforceState(pepperEnv, HOST, 'ceph.osd', true)
211 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200212
Jakub Josefa63f9862018-01-11 17:58:38 +0100213 // remove cluster flags
214 if (flags.size() > 0) {
215 stage('Unset cluster flags') {
216 for (flag in flags) {
217 common.infoMsg('Removing flag ' + flag)
218 runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd unset ' + flag)
219 }
Jiri Broulikdc87d722017-11-03 15:43:22 +0100220 }
221 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200222
Jakub Josefa63f9862018-01-11 17:58:38 +0100223 /*
224 if (ENFORCE_CRUSHMAP.toBoolean() == true) {
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200225
Jakub Josefa63f9862018-01-11 17:58:38 +0100226 // enforce crushmap `crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled; ceph osd setcrushmap -i /etc/ceph/crushmap.compiled`
227 stage('Enforce crushmap') {
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200228
Jakub Josefa63f9862018-01-11 17:58:38 +0100229 stage('Ask for manual confirmation') {
230 input message: "Are you sure that your ADMIN_HOST has correct /etc/ceph/crushmap file? Click proceed to compile and enforce crushmap."
231 }
232 runCephCommand(pepperEnv, ADMIN_HOST, 'crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled')
233 runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd setcrushmap -i /etc/ceph/crushmap.compiled')
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200234 }
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200235 }
Jakub Josefa63f9862018-01-11 17:58:38 +0100236 */
Jiri Broulik2c00f4c2017-10-26 13:23:11 +0200237 }
Jakub Josefa63f9862018-01-11 17:58:38 +0100238}