blob: 93b65732defafc8ea0d4ea5b7d2b4bd4ed679570 [file] [log] [blame]
/**
 *
 * Replace failed disk with a new disk
 *
 * Required parameters:
 *  SALT_MASTER_URL                     URL of Salt master
 *  SALT_MASTER_CREDENTIALS             Credentials to the Salt API
 *
 *  HOST                                Host (minion id) to be removed
 *  ADMIN_HOST                          Host (minion id) with admin keyring and /etc/crushmap file present
 *  OSD                                 Failed OSD ids to be replaced (comma-separated list - 1,2,3)
 *  DEVICE                              Comma-separated list of failed devices that will be replaced at HOST (/dev/sdb,/dev/sdc)
 *  JOURNAL_BLOCKDB_BLOCKWAL_PARTITION  Comma-separated list of partitions where journal or block_db or block_wal for the failed devices on this HOST were stored (/dev/sdh2,/dev/sdh3)
 *  CLUSTER_FLAGS                       Comma-separated list of flags to apply to the cluster
 *  WAIT_FOR_HEALTHY                    Wait for cluster rebalance before stopping daemons
 *  DMCRYPT                             Set to True if the OSDs being replaced are/were encrypted
 *
 */
19
// Shared-library helpers. `common` and `salt` are assigned without `def`, which puts
// them into the script binding so the methods declared below can reach them.
common = new com.mirantis.mk.Common()
salt = new com.mirantis.mk.Salt()
def python = new com.mirantis.mk.Python()

// Name of the pepper virtualenv used for all Salt API calls in this job.
def pepperEnv = "pepperEnv"

// Every list-like job parameter is a comma-separated string; tokenize() drops empty
// entries, so an empty parameter yields an empty list.
def listSeparator = ','
def flags = CLUSTER_FLAGS.tokenize(listSeparator)
def osds = OSD.tokenize(listSeparator)
def devices = DEVICE.tokenize(listSeparator)
def journals_blockdbs_blockwals = JOURNAL_BLOCKDB_BLOCKWAL_PARTITION.tokenize(listSeparator)
Jiri Broulik2c00f4c2017-10-26 13:23:11 +020029
30
/**
 * Run a shell command on a minion through the Salt helper.
 *
 * @param master  Salt connection handle, passed straight to salt.cmdRun
 * @param target  minion (or compound target) the command is executed on
 * @param cmd     shell command line to run
 * @return whatever salt.cmdRun returns for this call
 */
def runCephCommand(master, target, cmd) {
    def result = salt.cmdRun(master, target, cmd)
    return result
}
34
/**
 * Poll `ceph health` on ADMIN_HOST until the output contains HEALTH_OK or the
 * attempt budget is used up. Sleeps 10 between polls (Jenkins `sleep` step).
 *
 * The function never fails the build: if the cluster does not become healthy in
 * time, a warning is logged and the caller continues.
 *
 * @param master    Salt connection handle, forwarded to runCephCommand
 * @param count     starting value of the attempt counter (default 0)
 * @param attempts  polling stops once the counter reaches this value (default 300)
 */
def waitForHealthy(master, count=0, attempts=300) {
    while (count < attempts) {
        // first value of the first entry in the Salt 'return' list is the command output
        def health = runCephCommand(master, ADMIN_HOST, 'ceph health')['return'][0].values()[0]
        if (health.contains('HEALTH_OK')) {
            common.infoMsg('Cluster is healthy')
            return
        }
        count++
        sleep(10)
    }
    // Previously the loop ended silently here, making a timeout indistinguishable
    // from a healthy cluster in the job log.
    common.warningMsg('Cluster did not report HEALTH_OK within the allowed number of checks (' + attempts + '), continuing anyway')
}
/**
 * Split a partition path into its parent device and partition number.
 *
 * Only the trailing digits are treated as the partition number, so device names
 * that contain digits themselves are kept intact:
 *   /dev/sdh2       -> dev: /dev/sdh,     part_id: 2
 *   /dev/sda10      -> dev: /dev/sda,     part_id: 10
 *   /dev/nvme0n1p2  -> dev: /dev/nvme0n1, part_id: 2
 *
 * @param partition  partition path; surrounding whitespace is ignored
 * @return map with keys `dev` and `part_id` (part_id is '' when the path has no trailing digits)
 */
def splitPartition(partition) {
    def path = partition.trim()
    def stem = path.replaceAll('\\d+$', '')
    def partId = path.substring(stem.length())
    def dev = stem
    // When the base device name ends in a digit, the kernel separates the partition
    // number with a 'p' (nvme0n1p2, mmcblk0p1); drop that separator.
    if (partId && stem.matches('.*\\dp')) {
        dev = stem.substring(0, stem.length() - 1)
    }
    return [dev: dev, part_id: partId]
}

// Main job body: take the failed OSDs out of the cluster, wipe their devices and
// journal/block_db/block_wal partitions on HOST, then redeploy via the ceph.osd state.
timeout(time: 12, unit: 'HOURS') {
    node("python") {

        // create connection to salt master
        python.setupPepperVirtualenv(pepperEnv, SALT_MASTER_URL, SALT_MASTER_CREDENTIALS)

        // 'osd.<id>' names as used by the ceph CLI
        def osd_ids = []

        for (osd_id in osds) {
            osd_ids.add('osd.' + osd_id)
            print("Will delete " + osd_id)
        }

        // `ceph osd out <id> <id>`
        stage('Set OSDs out') {
            runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd out ' + osd_ids.join(' '))
        }

        // wait for healthy cluster
        if (WAIT_FOR_HEALTHY.toBoolean()) {
            sleep(5)
            waitForHealthy(pepperEnv)
        }

        // set cluster flags
        if (flags.size() > 0) {
            stage('Set cluster flags') {
                for (flag in flags) {
                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd set ' + flag)
                }
            }
        }

        // stop osd daemons; the systemd unit is named after the bare OSD id
        stage('Stop OSD daemons') {
            for (osd_id in osds) {
                salt.runSaltProcessStep(pepperEnv, HOST, 'service.stop', ['ceph-osd@' + osd_id], null, true)
            }
        }
        /*
        // `ceph osd crush remove osd.2`
        stage('Remove OSDs from CRUSH') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd crush remove ' + i)
            }
        }

        // wait for pgs to rebalance
        if (WAIT_FOR_PG_REBALANCE.toBoolean() == true) {
            stage('Waiting for pgs to rebalance') {
                while (true) {
                    def status = runCephCommand(pepperEnv, ADMIN_HOST, 'ceph -s')['return'][0].values()[0]
                    if (!status.contains('degraded')) {
                        common.infoMsg('PGs rebalanced')
                        break;
                    }
                    sleep(10)
                }
            }
        }
        */
        // remove keyring `ceph auth del osd.3`
        stage('Remove OSD keyrings from auth') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph auth del ' + i)
            }
        }

        // remove osd `ceph osd rm osd.3`
        stage('Remove OSDs') {
            for (i in osd_ids) {
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd rm ' + i)
            }
        }

        if (DMCRYPT.toBoolean()) {

            // remove partition tables: overwrite the first 512-byte sector of each device
            stage('dd part tables') {
                for (dev in devices) {
                    runCephCommand(pepperEnv, HOST, "dd if=/dev/zero of=${dev} bs=512 count=1 conv=notrunc")
                }
            }

            // remove journal, block_db or block_wal partition `parted /dev/sdj rm 3`
            stage('Remove journal / block_db / block_wal partitions') {
                for (partition in journals_blockdbs_blockwals) {
                    if (partition?.trim()) {
                        // e.g. /dev/sdi2 -> dev = /dev/sdi, part_id = 2
                        def target = splitPartition(partition)
                        try {
                            // NOTE(review): `Ignore` is not an executable; this looks like it was meant
                            // to feed the answer "Ignore" to a parted prompt (`echo Ignore | ...`).
                            // Left unchanged until confirmed on a real host.
                            runCephCommand(pepperEnv, HOST, "Ignore | parted ${target.dev} rm ${target.part_id}")
                        } catch (Exception e) {
                            // best effort: a failed removal is logged, not fatal
                            common.warningMsg(e)
                        }
                    }
                }
            }

            // reboot
            stage('reboot and wait') {
                salt.runSaltProcessStep(pepperEnv, HOST, 'system.reboot', null, null, true, 5)
                salt.minionsReachable(pepperEnv, 'I@salt:master', HOST)
                sleep(10)
            }

            // zap disks `ceph-disk zap /dev/sdi`
            stage('Zap devices') {
                for (dev in devices) {
                    // NOTE(review): the zap is attempted twice and only the second failure is fatal;
                    // presumably the first attempt can fail on a freshly wiped disk - confirm.
                    try {
                        runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                    } catch (Exception e) {
                        common.warningMsg(e)
                    }
                    runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                }
            }

        } else {

            // umount `umount /dev/sdi1`
            stage('Umount devices') {
                for (dev in devices) {
                    runCephCommand(pepperEnv, HOST, 'umount ' + dev + '1')
                }
            }

            // zap disks `ceph-disk zap /dev/sdi`
            stage('Zap devices') {
                for (dev in devices) {
                    runCephCommand(pepperEnv, HOST, 'ceph-disk zap ' + dev)
                }
            }

            // remove journal, block_db or block_wal partition `parted /dev/sdj rm 3`
            stage('Remove journal / block_db / block_wal partitions') {
                for (partition in journals_blockdbs_blockwals) {
                    if (partition?.trim()) {
                        // e.g. /dev/sdi2 -> dev = /dev/sdi, part_id = 2
                        def target = splitPartition(partition)
                        try {
                            runCephCommand(pepperEnv, HOST, "parted ${target.dev} rm ${target.part_id}")
                        } catch (Exception e) {
                            // best effort: a failed removal is logged, not fatal
                            common.warningMsg(e)
                        }
                    }
                }
            }
        }

        // Deploy failed Ceph OSD
        stage('Deploy Ceph OSD') {
            salt.enforceState(pepperEnv, HOST, 'ceph.osd', true)
        }

        // remove cluster flags
        if (flags.size() > 0) {
            stage('Unset cluster flags') {
                for (flag in flags) {
                    common.infoMsg('Removing flag ' + flag)
                    runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd unset ' + flag)
                }
            }
        }

        /*
        if (ENFORCE_CRUSHMAP.toBoolean() == true) {

            // enforce crushmap `crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled; ceph osd setcrushmap -i /etc/ceph/crushmap.compiled`
            stage('Enforce crushmap') {

                stage('Ask for manual confirmation') {
                    input message: "Are you sure that your ADMIN_HOST has correct /etc/ceph/crushmap file? Click proceed to compile and enforce crushmap."
                }
                runCephCommand(pepperEnv, ADMIN_HOST, 'crushtool -c /etc/ceph/crushmap -o /etc/ceph/crushmap.compiled')
                runCephCommand(pepperEnv, ADMIN_HOST, 'ceph osd setcrushmap -i /etc/ceph/crushmap.compiled')
            }
        }
        */
    }
}