package com.mirantis.mk

/**
 *
 * Galera functions
 *
 */

/**
 * Returns parameters from mysql.status output on given target node
 *
 * @param env Salt Connection object or pepperEnv
 * @param target Targeted node
 * @param parameters Parameters to be returned (String or list of Strings). If no parameters are provided or the list is empty, all of them are returned.
 * @param print Boolean to enable pretty-printing of the whole mysql.status output
 * @return result Map of the requested parameters with their values
 */

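// Example usage (hypothetical pipeline step; 'pepperEnv' is assumed to be an
// initialized Salt connection object): fetch two wsrep counters from the
// master and pretty-print the full mysql.status output:
//
//   def wsrep = getWsrepParameters(pepperEnv, 'I@galera:master',
//       ['wsrep_cluster_size', 'wsrep_cluster_status'], true)
//   if (wsrep['wsrep_cluster_status'] != 'Primary') {
//       error("Cluster is not in Primary state: ${wsrep['wsrep_cluster_status']}")
//   }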
def getWsrepParameters(env, target, parameters=[], print=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    result = [:]
    out = salt.runSaltProcessStep(env, "${target}", "mysql.status", [], null, false)
    outlist = out['return'][0]
    resultYaml = outlist.get(outlist.keySet()[0]).sort()
    if (print) {
        common.prettyPrint(resultYaml)
    }
    // a single parameter name is wrapped into a list so both input types share one code path
    if (parameters instanceof String) {
        parameters = [parameters]
    }
    if (parameters == [] || parameters == null) {
        result = resultYaml
    } else {
        for (key in parameters) {
            value = resultYaml[key]
            if (value instanceof String && value.isBigDecimal()) {
                value = value.toBigDecimal()
            }
            // (key) forces the variable's value to be used as the map key
            result << [(key): value]
        }
    }
    return result
}

/**
 * Verifies Galera database
 *
 * This function checks for the Galera master, tests the connection and, if the node is reachable,
 * obtains the result of the Salt mysql.status function. The result is then parsed, validated and
 * shown to the user.
 *
 * @param env Salt Connection object or pepperEnv
 * @param slave Boolean value to enable slave checking (if the master is unreachable)
 * @param checkTimeSync Boolean value to enable time sync check
 * @return resultCode int value used to determine the exit status in the calling function
 */
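// Example usage (hypothetical): run the full verification including the time
// sync check and map the numeric result code to a pipeline failure:
//
//   def resultCode = verifyGaleraStatus(pepperEnv, false, true)
//   if (resultCode >= 128) {
//       error("Galera cluster verification failed with code ${resultCode}")
//   }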
def verifyGaleraStatus(env, slave=false, checkTimeSync=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    def out = ""
    def status = "unknown"
    def testNode = ""
    if (!slave) {
        try {
            galeraMaster = salt.getMinions(env, "I@galera:master")
            common.infoMsg("Current Galera master is: ${galeraMaster}")
            salt.minionsReachable(env, "I@salt:master", "I@galera:master")
            testNode = "I@galera:master"
        } catch (Exception e) {
            common.errorMsg('Galera master is not reachable.')
            common.errorMsg(e.getMessage())
            return 128
        }
    } else {
        try {
            galeraSlaves = salt.getMinions(env, "I@galera:slave")
            common.infoMsg("Testing Galera slave minions: ${galeraSlaves}")
        } catch (Exception e) {
            common.errorMsg("Cannot obtain Galera slave minions list.")
            common.errorMsg(e.getMessage())
            return 129
        }
        for (minion in galeraSlaves) {
            try {
                salt.minionsReachable(env, "I@salt:master", minion)
                testNode = minion
                break
            } catch (Exception e) {
                common.warningMsg("Slave '${minion}' is not reachable.")
            }
        }
    }
    if (!testNode) {
        common.errorMsg("No Galera slave was reachable.")
        return 130
    }
    def checkTargets = salt.getMinions(env, "I@xtrabackup:client or I@xtrabackup:server")
    for (checkTarget in checkTargets) {
        def nodeStatus = salt.minionsReachable(env, 'I@salt:master', checkTarget, null, 10, 5)
        if (nodeStatus != null) {
            def iostatRes = salt.getIostatValues(['saltId': env, 'target': checkTarget, 'parameterName': "%util", 'output': true])
            if (iostatRes == [:]) {
                common.errorMsg("Received empty response from iostat call on ${checkTarget}. Maybe the 'sysstat' package is not installed?")
                return 140
            }
            for (int i = 0; i < iostatRes.size(); i++) {
                def diskKey = iostatRes.keySet()[i]
                if (!(iostatRes[diskKey].toString().isBigDecimal() && (iostatRes[diskKey].toBigDecimal() < 50))) {
                    common.errorMsg("Disk ${diskKey} has too high I/O utilization. The maximum allowed value is 50 and the current value is ${iostatRes[diskKey]}.")
                    return 141
                }
            }
        }
    }
    common.infoMsg("Disk I/O utilization was checked and everything seems to be in order.")
    if (checkTimeSync && !salt.checkClusterTimeSync(env, "I@galera:master or I@galera:slave")) {
        common.errorMsg("Time in the cluster is desynchronized or it couldn't be determined. You should fix this issue manually before proceeding.")
        return 131
    }
    try {
        out = salt.runSaltProcessStep(env, "${testNode}", "mysql.status", [], null, false)
    } catch (Exception e) {
        common.errorMsg('Could not determine mysql status.')
        common.errorMsg(e.getMessage())
        return 256
    }
    if (out) {
        try {
            status = validateAndPrintGaleraStatusReport(env, out, testNode)
        } catch (Exception e) {
            common.errorMsg('Could not parse the mysql status output. Check it manually.')
            common.errorMsg(e.getMessage())
            return 1
        }
    } else {
        common.errorMsg("Mysql status response is unrecognized or empty. Response: ${out}")
        return 1024
    }
    if (status == "OK") {
        common.infoMsg("No errors found - MySQL status is ${status}.")
        return 0
    } else if (status == "unknown") {
        common.warningMsg('MySQL status cannot be determined')
        return 1
    } else {
        common.errorMsg("Errors found.")
        return 2
    }
}

/**
 * Validates and prints the result of the verifyGaleraStatus function
 *
 * @param env Salt Connection object or pepperEnv
 * @param out Output of the mysql.status Salt function
 * @param minion Node the output was obtained from (used to determine the node's Galera role)
 * @return status "OK", "ERROR" or "unknown" depending on the result of the validation
 */

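// Called internally by verifyGaleraStatus, e.g. (hypothetical):
//
//   out = salt.runSaltProcessStep(env, testNode, "mysql.status", [], null, false)
//   status = validateAndPrintGaleraStatusReport(env, out, testNode)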
def validateAndPrintGaleraStatusReport(env, out, minion) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    if (minion == "I@galera:master") {
        role = "master"
    } else {
        role = "slave"
    }
    sizeOut = salt.getReturnValues(salt.getPillar(env, minion, "galera:${role}:members"))
    expected_cluster_size = sizeOut.size()
    outlist = out['return'][0]
    resultYaml = outlist.get(outlist.keySet()[0]).sort()
    common.prettyPrint(resultYaml)
    parameters = [
        wsrep_cluster_status: [title: 'Cluster status', expectedValues: ['Primary'], description: ''],
        wsrep_cluster_size: [title: 'Current cluster size', expectedValues: [expected_cluster_size], description: ''],
        wsrep_ready: [title: 'Node status', expectedValues: ['ON', true], description: ''],
        wsrep_local_state_comment: [title: 'Node status comment', expectedValues: ['Joining', 'Waiting on SST', 'Joined', 'Synced', 'Donor'], description: ''],
        wsrep_connected: [title: 'Node connectivity', expectedValues: ['ON', true], description: ''],
        wsrep_local_recv_queue_avg: [title: 'Average size of local received queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(A value above 0 means that the node cannot apply write-sets as fast as it receives them, which can lead to replication throttling)'],
        wsrep_local_send_queue_avg: [title: 'Average size of local send queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(A value above 0 indicates replication throttling or network throughput issues, such as a bottleneck on the network link)']
    ]
    for (key in parameters.keySet()) {
        value = resultYaml[key]
        if (value instanceof String && value.isBigDecimal()) {
            value = value.toBigDecimal()
        }
        parameters.get(key) << [actualValue: value]
    }
    for (key in parameters.keySet()) {
        param = parameters.get(key)
        if (key == 'wsrep_local_recv_queue_avg' || key == 'wsrep_local_send_queue_avg') {
            if (param.get('actualValue') == null || (param.get('actualValue') > param.get('expectedThreshold').get('error'))) {
                param << [match: 'error']
            } else if (param.get('actualValue') > param.get('expectedThreshold').get('warn')) {
                param << [match: 'warn']
            } else {
                param << [match: 'ok']
            }
        } else {
            for (expValue in param.get('expectedValues')) {
                if (expValue == param.get('actualValue')) {
                    param << [match: 'ok']
                    break
                } else {
                    param << [match: 'error']
                }
            }
        }
    }
    cluster_info_report = []
    cluster_warning_report = []
    cluster_error_report = []
    for (key in parameters.keySet()) {
        param = parameters.get(key)
        if (param.containsKey('expectedThreshold')) {
            expValues = "below ${param.get('expectedThreshold').get('warn')}"
        } else {
            if (param.get('expectedValues').size() > 1) {
                expValues = param.get('expectedValues').join(' or ')
            } else {
                expValues = param.get('expectedValues')[0]
            }
        }
        reportString = "${param.title}: ${param.actualValue} (Expected: ${expValues}) ${param.description}"
        if (param.get('match').equals('ok')) {
            cluster_info_report.add("[OK ] ${reportString}")
        } else if (param.get('match').equals('warn')) {
            cluster_warning_report.add("[WARNING] ${reportString}")
        } else {
            cluster_error_report.add("[ ERROR] ${reportString}")
        }
    }
    common.infoMsg("CLUSTER STATUS REPORT: ${cluster_info_report.size()} expected values, ${cluster_warning_report.size()} warnings and ${cluster_error_report.size()} errors found:")
    if (cluster_info_report.size() > 0) {
        common.infoMsg(cluster_info_report.join('\n'))
    }
    if (cluster_warning_report.size() > 0) {
        common.warningMsg(cluster_warning_report.join('\n'))
    }
    if (cluster_error_report.size() > 0) {
        common.errorMsg(cluster_error_report.join('\n'))
        return "ERROR"
    } else {
        return "OK"
    }
}

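/**
 * Finds the Galera node that was shut down last, i.e. the member with the
 * highest 'seqno' value in /var/lib/mysql/grastate.dat. Falls back to
 * 'I@galera:master' when the members list or the seqno values cannot be read.
 *
 * @param env Salt Connection object or pepperEnv
 * @return Salt target string for the last shutdown node ("S@<ip>" or "I@galera:master")
 */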
def getGaleraLastShutdownNode(env) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    members = ''
    lastNode = [ip: '', seqno: -2]
    try {
        members = salt.getReturnValues(salt.getPillar(env, "I@galera:master", "galera:master:members"))
    } catch (Exception e) {
        common.errorMsg('Could not retrieve members list')
        common.errorMsg(e.getMessage())
        return 'I@galera:master'
    }
    if (members) {
        for (member in members) {
            try {
                salt.minionsReachable(env, 'I@salt:master', "S@${member.host}")
                out = salt.getReturnValues(salt.cmdRun(env, "S@${member.host}", 'cat /var/lib/mysql/grastate.dat | grep "seqno" | cut -d ":" -f2', true, null, false))
                seqno = out.tokenize('\n')[0].trim()
                if (seqno.isNumber()) {
                    seqno = seqno.toInteger()
                } else {
                    seqno = -2
                }
                highestSeqno = lastNode.get('seqno')
                if (seqno > highestSeqno) {
                    lastNode << [ip: "${member.host}", seqno: seqno]
                }
            } catch (Exception e) {
                common.warningMsg("Could not determine 'seqno' value for node ${member.host}")
                common.warningMsg(e.getMessage())
            }
        }
    }
    if (lastNode.get('ip') != '') {
        return "S@${lastNode.ip}"
    } else {
        return "I@galera:master"
    }
}

/**
 * Restores Galera cluster
 * @param env Salt Connection object or pepperEnv
 * @param runRestoreDb Boolean to determine whether the restoration of the DB should be run as well
 * @return output of salt commands
 */
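// Example usage (hypothetical): stop mysql everywhere, bootstrap from the node
// with the highest grastate.dat seqno and re-run the xtrabackup DB restore:
//
//   restoreGaleraCluster(pepperEnv, true)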
def restoreGaleraCluster(env, runRestoreDb=true) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    try {
        salt.runSaltProcessStep(env, 'I@galera:slave', 'service.stop', ['mysql'])
    } catch (Exception er) {
        common.warningMsg('Mysql service already stopped')
    }
    try {
        salt.runSaltProcessStep(env, 'I@galera:master', 'service.stop', ['mysql'])
    } catch (Exception er) {
        common.warningMsg('Mysql service already stopped')
    }
    lastNodeTarget = getGaleraLastShutdownNode(env)
    try {
        salt.cmdRun(env, 'I@galera:slave', "rm /var/lib/mysql/ib_logfile*")
    } catch (Exception er) {
        common.warningMsg('Files are not present')
    }
    try {
        salt.cmdRun(env, 'I@galera:slave', "rm /var/lib/mysql/grastate.dat")
    } catch (Exception er) {
        common.warningMsg('Files are not present')
    }
    try {
        salt.cmdRun(env, lastNodeTarget, "mkdir /root/mysql/mysql.bak")
    } catch (Exception er) {
        common.warningMsg('Directory already exists')
    }
    try {
        salt.cmdRun(env, lastNodeTarget, "rm -rf /root/mysql/mysql.bak/*")
    } catch (Exception er) {
        common.warningMsg('Directory already empty')
    }
    try {
        salt.cmdRun(env, lastNodeTarget, "mv /var/lib/mysql/* /root/mysql/mysql.bak")
    } catch (Exception er) {
        common.warningMsg('Files were already moved')
    }
    try {
        salt.runSaltProcessStep(env, lastNodeTarget, 'file.remove', ["/var/lib/mysql/.galera_bootstrap"])
    } catch (Exception er) {
        common.warningMsg('File is not present')
    }

    // make sure that the gcomm parameter is empty
    salt.cmdRun(env, lastNodeTarget, "sed -i '/gcomm/c\\wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // run restore of the DB
    if (runRestoreDb) {
        restoreGaleraDb(env, lastNodeTarget)
    }

    // start the mysql service on the last node
    salt.runSaltProcessStep(env, lastNodeTarget, 'service.start', ['mysql'])

    // wait until the mysql service on the last node is up
    try {
        salt.commandStatus(env, lastNodeTarget, 'service mysql status', 'running')
    } catch (Exception er) {
        input message: "Database is not running. Please fix it first and only then click PROCEED."
    }

    // start mysql services on the rest of the nodes
    salt.runSaltProcessStep(env, "I@galera:master and not ${lastNodeTarget}", 'service.start', ['mysql'])
    salt.runSaltProcessStep(env, "I@galera:slave and not ${lastNodeTarget}", 'service.start', ['mysql'])

    // wait until the mysql service on the rest of the nodes is up
    try {
        salt.commandStatus(env, "( I@galera:master or I@galera:slave ) and not ${lastNodeTarget}", 'service mysql status', 'running')
    } catch (Exception er) {
        input message: "Database is not running. Please fix it first and only then click PROCEED."
    }

    // apply any changes in configuration
    salt.enforceState(env, lastNodeTarget, 'galera')
}

/**
 * Restores Galera database
 * @param env Salt Connection object or pepperEnv
 * @param targetNode Node to be targeted
 */
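// Example usage (hypothetical): re-run the xtrabackup restore on the node
// that will bootstrap the cluster:
//
//   def lastNodeTarget = getGaleraLastShutdownNode(pepperEnv)
//   restoreGaleraDb(pepperEnv, lastNodeTarget)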
def restoreGaleraDb(env, targetNode) {
    def salt = new com.mirantis.mk.Salt()
    def backup_dir = salt.getReturnValues(salt.getPillar(env, targetNode, 'xtrabackup:client:backup_dir'))
    if (backup_dir == null || backup_dir.isEmpty()) { backup_dir = '/var/backups/mysql/xtrabackup' }
    salt.runSaltProcessStep(env, targetNode, 'file.remove', ["${backup_dir}/dbrestored"])
    salt.cmdRun(env, 'I@xtrabackup:client', "su root -c 'salt-call state.sls xtrabackup'")
}

def restoreGaleraDb(env) {
    def common = new com.mirantis.mk.Common()
    common.warningMsg("This method was renamed to 'restoreGaleraCluster'. Please change your pipeline to use that call instead! If you really wanted to call 'restoreGaleraDb', you may be missing the 'targetNode' parameter in your call.")
    return restoreGaleraCluster(env)
}

/**
 * Starts the first node in the mysql cluster. Cluster members stay removed in the mysql config;
 * an additional service restart will be needed once all nodes are up.
 * https://docs.mirantis.com/mcp/q4-18/mcp-operations-guide/tshooting/tshoot-mcp-openstack/tshoot-galera/restore-galera-cluster/restore-galera-manually.html#restore-galera-manually
 *
 * @param env Salt Connection object or pepperEnv
 * @param target last stopped Galera node
 * @return output of salt commands
 */
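// Example usage (hypothetical): bootstrap the cluster from the last stopped
// node and wait until wsrep reports it as OPERATIONAL:
//
//   def target = getGaleraLastShutdownNode(pepperEnv)
//   startFirstNode(pepperEnv, target)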
def startFirstNode(env, target) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()

    // make sure that the gcomm parameter is empty
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/ s/^#*/#/' /etc/mysql/my.cnf")
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/a wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // start the mysql service on the last node
    salt.runSaltProcessStep(env, target, 'service.start', ['mysql'])

    // wait until the mysql service on the last node is up
    common.retry(30, 10) {
        value = getWsrepParameters(env, target, 'wsrep_evs_state')
        if (value['wsrep_evs_state'] == 'OPERATIONAL') {
            common.infoMsg('WSREP state: OPERATIONAL')
        } else {
            throw new Exception("Mysql service is not running. Please fix it.")
        }
    }
}