blob: e6a34c1a00abfa0a9fc86b50228254f0a8e39c0c [file] [log] [blame]
Martin Polreich8f0f3ac2019-02-15 10:03:33 +01001package com.mirantis.mk
2
3/**
4 *
5 * Galera functions
6 *
7 */
8
9
/**
 * Returns parameters from mysql.status output on given target node
 *
 * @param env Salt Connection object or pepperEnv
 * @param target Targeted node
 * @param parameters Parameters to be returned (String, GString or list of Strings). If no parameters are provided
 *                   or it is set to '[]' (or to an empty string), all of them are returned.
 * @param print Boolean, when true the whole sorted mysql.status output is pretty-printed before filtering
 * @return result Map of parameter names and their values. Explicitly requested values that look numeric are
 *                converted to BigDecimal; when all parameters are returned, the values are passed through unchanged.
 */
def getWsrepParameters(env, target, parameters=[], print=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // All working variables are declared with 'def' so they stay local instead of leaking into the script binding.
    def result = [:]
    def out = salt.runSaltProcessStep(env, "${target}", "mysql.status", [], null, false)
    // Only the entry under the first key of the first 'return' element is used
    // (presumably the status of the first matched minion - confirm against Salt.runSaltProcessStep).
    def outlist = out['return'][0]
    def resultYaml = outlist.get(outlist.keySet()[0]).sort()
    if (print) {
        common.prettyPrint(resultYaml)
    }
    // Accept a single parameter name. CharSequence also covers GString ("${name}"), which would otherwise
    // be iterated character by character in the loop below.
    if (parameters instanceof CharSequence) {
        parameters = [parameters.toString()]
    }
    if (parameters == [] || parameters == ['']) {
        result = resultYaml
    } else {
        for (String param in parameters) {
            def value = resultYaml[param]
            if (value instanceof String && value.isBigDecimal()) {
                value = value.toBigDecimal()
            }
            result[param] = value
        }
    }
    return result
}
45
/**
 * Verifies Galera database
 *
 * This function checks for Galera master, tests connection and if reachable, it obtains the result
 * of Salt mysql.status function. The result is then parsed, validated and printed to the user.
 * Before the status is read, disk i/o utilization of the xtrabackup nodes and (optionally) time
 * synchronization of the cluster are checked.
 *
 * @param env Salt Connection object or pepperEnv
 * @param slave Boolean value to enable slave checking (if master is unreachable)
 * @param checkTimeSync Boolean value to enable time sync check
 * @return resultCode int value used to determine exit status in the calling function:
 *         0    - MySQL status is OK
 *         1    - status output could not be parsed or the status stayed "unknown"
 *         2    - validation found errors
 *         128  - Galera master is not reachable
 *         129  - list of Galera slave minions could not be obtained
 *         130  - no test node could be selected (no slave was reachable)
 *         131  - time in the cluster is desynchronized or could not be determined
 *         140  - iostat returned an empty response
 *         141  - a disk utilization value is not numeric or is not below 50
 *         256  - mysql.status call failed
 *         1024 - mysql.status response is empty
 */
def verifyGaleraStatus(env, slave=false, checkTimeSync=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    def out = ""
    def status = "unknown"
    def testNode = ""
    if (!slave) {
        try {
            // Declared locally so the value does not leak into the script binding.
            def galeraMaster = salt.getMinions(env, "I@galera:master")
            common.infoMsg("Current Galera master is: ${galeraMaster}")
            salt.minionsReachable(env, "I@salt:master", "I@galera:master")
            testNode = "I@galera:master"
        } catch (Exception e) {
            common.errorMsg('Galera master is not reachable.')
            common.errorMsg(e.getMessage())
            return 128
        }
    } else {
        // Declared outside of the 'try' because the list is iterated after it.
        def galeraSlaves
        try {
            galeraSlaves = salt.getMinions(env, "I@galera:slave")
            common.infoMsg("Testing Galera slave minions: ${galeraSlaves}")
        } catch (Exception e) {
            common.errorMsg("Cannot obtain Galera slave minions list.")
            common.errorMsg(e.getMessage())
            return 129
        }
        // The first slave that passes the reachability check becomes the test node.
        for (minion in galeraSlaves) {
            try {
                salt.minionsReachable(env, "I@salt:master", minion)
                testNode = minion
                break
            } catch (Exception e) {
                common.warningMsg("Slave '${minion}' is not reachable.")
            }
        }
    }
    if (!testNode) {
        common.errorMsg("No Galera slave was reachable.")
        return 130
    }
    def checkTargets = salt.getMinions(env, "I@xtrabackup:client or I@xtrabackup:server")
    for (checkTarget in checkTargets) {
        def nodeStatus = salt.minionsReachable(env, 'I@salt:master', checkTarget, null, 10, 5)
        // Nodes for which the reachability check returns null are skipped.
        if (nodeStatus != null) {
            def iostatRes = salt.getIostatValues(['saltId': env, 'target': checkTarget, 'parameterName': "%util", 'output': true])
            if (iostatRes == [:]) {
                common.errorMsg("Recevived empty response from iostat call on ${checkTarget}. Maybe 'sysstat' package is not installed?")
                return 140
            }
            for (diskKey in iostatRes.keySet()) {
                // A value that is not numeric is treated the same way as a value that is too high.
                if (!(iostatRes[diskKey].toString().isBigDecimal() && (iostatRes[diskKey].toBigDecimal() < 50 ))) {
                    common.errorMsg("Disk ${diskKey} has to high i/o utilization. Maximum value is 50 and current value is ${iostatRes[diskKey]}.")
                    return 141
                }
            }
        }
    }
    common.infoMsg("Disk i/o utilization was checked and everything seems to be in order.")
    if (checkTimeSync && !salt.checkClusterTimeSync(env, "I@galera:master or I@galera:slave")) {
        common.errorMsg("Time in cluster is desynchronized or it couldn't be detemined. You should fix this issue manually before proceeding.")
        return 131
    }
    try {
        out = salt.runSaltProcessStep(env, "${testNode}", "mysql.status", [], null, false)
    } catch (Exception e) {
        common.errorMsg('Could not determine mysql status.')
        common.errorMsg(e.getMessage())
        return 256
    }
    if (out) {
        try {
            status = validateAndPrintGaleraStatusReport(env, out, testNode)
        } catch (Exception e) {
            common.errorMsg('Could not parse the mysql status output. Check it manually.')
            common.errorMsg(e.getMessage())
            return 1
        }
    } else {
        common.errorMsg("Mysql status response unrecognized or is empty. Response: ${out}")
        return 1024
    }
    if (status == "OK") {
        common.infoMsg("No errors found - MySQL status is ${status}.")
        return 0
    } else if (status == "unknown") {
        common.warningMsg('MySQL status cannot be detemined')
        return 1
    } else {
        common.errorMsg("Errors found.")
        return 2
    }
}
150
/**
 * Validates and prints result of verifyGaleraStatus function
 *
 * Selected wsrep_* values of the mysql.status output are compared with their expected values
 * (or thresholds) and a report split into OK / WARNING / ERROR lines is printed.
 *
 * @param env Salt Connection object or pepperEnv
 * @param out Output of the mysql.status Salt function
 * @param minion Target the output was obtained from. "I@galera:master" selects the master pillar,
 *               any other value selects the slave pillar for the expected cluster size.
 * @return status "OK" when no error line was reported (warnings are allowed), "ERROR" otherwise
 */
def validateAndPrintGaleraStatusReport(env, out, minion) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // All working variables are declared with 'def' so they stay local instead of leaking into the script binding.
    def role
    if (minion == "I@galera:master") {
        role = "master"
    } else {
        role = "slave"
    }
    // Expected cluster size is the number of members configured in the pillar.
    def sizeOut = salt.getReturnValues(salt.getPillar(env, minion, "galera:${role}:members"))
    def expected_cluster_size = sizeOut.size()
    def outlist = out['return'][0]
    def resultYaml = outlist.get(outlist.keySet()[0]).sort()
    common.prettyPrint(resultYaml)
    def parameters = [
        wsrep_cluster_status: [title: 'Cluster status', expectedValues: ['Primary'], description: ''],
        wsrep_cluster_size: [title: 'Current cluster size', expectedValues: [expected_cluster_size], description: ''],
        wsrep_ready: [title: 'Node status', expectedValues: ['ON', true], description: ''],
        wsrep_local_state_comment: [title: 'Node status comment', expectedValues: ['Joining', 'Waiting on SST', 'Joined', 'Synced', 'Donor'], description: ''],
        wsrep_connected: [title: 'Node connectivity', expectedValues: ['ON', true], description: ''],
        wsrep_local_recv_queue_avg: [title: 'Average size of local reveived queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(Value above 0 means that the node cannot apply write-sets as fast as it receives them, which can lead to replication throttling)'],
        wsrep_local_send_queue_avg: [title: 'Average size of local send queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(Value above 0 indicate replication throttling or network throughput issues, such as a bottleneck on the network link.)']
    ]
    // Step 1: attach the actual value to every checked parameter (numeric strings become BigDecimal).
    for (key in parameters.keySet()) {
        def value = resultYaml[key]
        if (value instanceof String && value.isBigDecimal()) {
            value = value.toBigDecimal()
        }
        parameters.get(key) << [actualValue: value]
    }
    // Step 2: classify every parameter as 'ok', 'warn' or 'error'.
    for (key in parameters.keySet()) {
        def param = parameters.get(key)
        if (key == 'wsrep_local_recv_queue_avg' || key == 'wsrep_local_send_queue_avg') {
            // Threshold based parameters; a missing value counts as an error.
            if (param.get('actualValue') == null || (param.get('actualValue') > param.get('expectedThreshold').get('error'))) {
                param << [match: 'error']
            } else if (param.get('actualValue') > param.get('expectedThreshold').get('warn')) {
                param << [match: 'warn']
            } else {
                param << [match: 'ok']
            }
        } else {
            // Value based parameters; the first matching expected value wins, otherwise 'error' stays set.
            for (expValue in param.get('expectedValues')) {
                if (expValue == param.get('actualValue')) {
                    param << [match: 'ok']
                    break
                } else {
                    param << [match: 'error']
                }
            }
        }
    }
    // Step 3: build the report lines.
    def cluster_info_report = []
    def cluster_warning_report = []
    def cluster_error_report = []
    for (key in parameters.keySet()) {
        def param = parameters.get(key)
        def expValues
        if (param.containsKey('expectedThreshold')) {
            expValues = "below ${param.get('expectedThreshold').get('warn')}"
        } else {
            if (param.get('expectedValues').size() > 1) {
                expValues = param.get('expectedValues').join(' or ')
            } else {
                expValues = param.get('expectedValues')[0]
            }
        }
        def reportString = "${param.title}: ${param.actualValue} (Expected: ${expValues}) ${param.description}"
        if (param.get('match').equals('ok')) {
            cluster_info_report.add("[OK     ] ${reportString}")
        } else if (param.get('match').equals('warn')) {
            cluster_warning_report.add("[WARNING] ${reportString}")
        } else {
            // The stray closing parenthesis that used to be appended to error lines was removed.
            cluster_error_report.add("[  ERROR] ${reportString}")
        }
    }
    common.infoMsg("CLUSTER STATUS REPORT: ${cluster_info_report.size()} expected values, ${cluster_warning_report.size()} warnings and ${cluster_error_report.size()} error found:")
    if (cluster_info_report.size() > 0) {
        common.infoMsg(cluster_info_report.join('\n'))
    }
    if (cluster_warning_report.size() > 0) {
        common.warningMsg(cluster_warning_report.join('\n'))
    }
    if (cluster_error_report.size() > 0) {
        common.errorMsg(cluster_error_report.join('\n'))
        return "ERROR"
    } else {
        return "OK"
    }
}
244
/**
 * Returns last shutdown node of Galera cluster
 *
 * The node is selected by the highest 'seqno' value found in /var/lib/mysql/grastate.dat.
 *
 * @param env Salt Connection object or pepperEnv
 * @param nodes List of nodes to check only (defaults to []). If not provided, it will check all nodes.
 *              Use this parameter if the cluster splits to several components and you only want to check one of them.
 * @return Salt target of the last shutdown node ("S@<ip address or hostname>"), or "I@galera:master"
 *         when the members list cannot be retrieved or no usable seqno is found
 */
def getGaleraLastShutdownNode(env, nodes = []) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // Working variables are declared with 'def' so they stay local instead of leaking into the script binding.
    def members = []
    def lastNode = [ip: '', seqno: -2]
    try {
        if (nodes) {
            def nodeAddresses = salt.getIPAddressesForNodenames(env, nodes)
            for (node in nodeAddresses) {
                // The new entry is wrapped in a list: 'Map + List' would produce a Map, not a list
                // of members, and 'member.host' below would then fail.
                members = [[host: "${node.get(node.keySet()[0])}"]] + members
            }
        } else {
            members = salt.getReturnValues(salt.getPillar(env, "I@galera:master", "galera:master:members"))
        }
    } catch (Exception e) {
        common.errorMsg('Could not retrieve members list')
        common.errorMsg(e.getMessage())
        return 'I@galera:master'
    }
    if (members) {
        for (member in members) {
            try {
                salt.minionsReachable(env, 'I@salt:master', "S@${member.host}")
                def out = salt.getReturnValues(salt.cmdRun(env, "S@${member.host}", 'cat /var/lib/mysql/grastate.dat | grep "seqno" | cut -d ":" -f2', true, null, false))
                def seqno = out.tokenize('\n')[0].trim()
                // Parsed as Long: toInteger() throws for values above the Integer range,
                // which would silently drop the node from the comparison.
                if (seqno.isLong()) {
                    seqno = seqno.toLong()
                } else {
                    seqno = -2
                }
                def highestSeqno = lastNode.get('seqno')
                // Strict comparison: on equal seqno the node found first is kept.
                if (seqno > highestSeqno) {
                    lastNode << [ip: "${member.host}", seqno: seqno]
                }
            } catch (Exception e) {
                common.warningMsg("Could not determine 'seqno' value for node ${member.host} ")
                common.warningMsg(e.getMessage())
            }
        }
    }
    if (lastNode.get('ip') != '') {
        return "S@${lastNode.ip}"
    } else {
        return "I@galera:master"
    }
}
298
/**
 * Restores Galera cluster
 *
 * Stops mysql on all Galera nodes, cleans the data directories, bootstraps the cluster from the
 * last shutdown node (optionally restoring the DB there first) and then starts the remaining nodes.
 *
 * @param env Salt Connection object or pepperEnv
 * @param runRestoreDb Boolean to determine if the restoration of DB should be run as well (defaults to true)
 * @return output of the final 'galera' state enforcement on the last shutdown node
 */
def restoreGaleraCluster(env, runRestoreDb=true) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // The cleanup steps below are best effort: a failure is only reported as a warning.
    try {
        salt.runSaltProcessStep(env, 'I@galera:slave', 'service.stop', ['mysql'])
    } catch (Exception er) {
        common.warningMsg('Mysql service already stopped')
    }
    try {
        salt.runSaltProcessStep(env, 'I@galera:master', 'service.stop', ['mysql'])
    } catch (Exception er) {
        common.warningMsg('Mysql service already stopped')
    }
    // Declared locally so the target does not leak into the script binding.
    def lastNodeTarget = getGaleraLastShutdownNode(env)
    try {
        salt.cmdRun(env, 'I@galera:slave', "rm /var/lib/mysql/ib_logfile*")
    } catch (Exception er) {
        common.warningMsg('Files are not present')
    }
    try {
        salt.cmdRun(env, 'I@galera:slave', "rm /var/lib/mysql/grastate.dat")
    } catch (Exception er) {
        common.warningMsg('Files are not present')
    }
    // Move the current data directory of the last shutdown node to a backup location.
    try {
        salt.cmdRun(env, lastNodeTarget, "mkdir /root/mysql/mysql.bak")
    } catch (Exception er) {
        common.warningMsg('Directory already exists')
    }
    try {
        salt.cmdRun(env, lastNodeTarget, "rm -rf /root/mysql/mysql.bak/*")
    } catch (Exception er) {
        common.warningMsg('Directory already empty')
    }
    try {
        salt.cmdRun(env, lastNodeTarget, "mv /var/lib/mysql/* /root/mysql/mysql.bak")
    } catch (Exception er) {
        common.warningMsg('Files were already moved')
    }
    try {
        salt.runSaltProcessStep(env, lastNodeTarget, 'file.remove', ["/var/lib/mysql/.galera_bootstrap"])
    } catch (Exception er) {
        common.warningMsg('File is not present')
    }

    // make sure that gcomm parameter is empty
    salt.cmdRun(env, lastNodeTarget, "sed -i '/gcomm/c\\wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // run restore of DB
    if (runRestoreDb) {
        restoreGaleraDb(env, lastNodeTarget)
    }

    // start mysql service on the last node
    salt.runSaltProcessStep(env, lastNodeTarget, 'service.start', ['mysql'])

    // wait until mysql service on the last node is up
    try {
        salt.commandStatus(env, lastNodeTarget, 'service mysql status', 'running')
    } catch (Exception er) {
        // Pause the pipeline until the operator confirms that the database was fixed manually.
        input message: "Database is not running please fix it first and only then click on PROCEED."
    }

    // start mysql services on the rest of the nodes
    salt.runSaltProcessStep(env, "I@galera:master and not ${lastNodeTarget}", 'service.start', ['mysql'])
    salt.runSaltProcessStep(env, "I@galera:slave and not ${lastNodeTarget}", 'service.start', ['mysql'])

    // wait until mysql service on the rest of the nodes is up
    try {
        salt.commandStatus(env, "( I@galera:master or I@galera:slave ) and not ${lastNodeTarget}", 'service mysql status', 'running')
    } catch (Exception er) {
        input message: "Database is not running please fix it first and only then click on PROCEED."
    }

    // apply any changes in configuration; its output is the return value of this function
    return salt.enforceState(env, lastNodeTarget, 'galera')
}
Martin Polreich9044fe42019-03-21 16:00:23 +0100383
/**
 * Restores Galera database
 *
 * Removes the 'dbrestored' marker file from the xtrabackup backup directory of the target node
 * and re-applies the 'xtrabackup' Salt state.
 *
 * @param env Salt Connection object or pepperEnv
 * @param targetNode Node to be targeted
 * @return output of the 'salt-call state.sls xtrabackup' command
 */
def restoreGaleraDb(env, targetNode) {
    // 'salt' was used without being defined in this function, which fails at runtime.
    def salt = new com.mirantis.mk.Salt()
    def backup_dir = salt.getReturnValues(salt.getPillar(env, targetNode, 'xtrabackup:client:backup_dir'))
    // Fall back to the default location when the pillar does not define a backup directory.
    if (backup_dir == null || backup_dir.isEmpty()) {
        backup_dir = '/var/backups/mysql/xtrabackup'
    }
    salt.runSaltProcessStep(env, targetNode, 'file.remove', ["${backup_dir}/dbrestored"])
    // NOTE(review): the state is applied on 'I@xtrabackup:client', not on targetNode - confirm this is intended.
    return salt.cmdRun(env, 'I@xtrabackup:client', "su root -c 'salt-call state.sls xtrabackup'")
}
395
/**
 * Backward compatible wrapper kept for pipelines written before this method was renamed.
 * Prints a warning and delegates to restoreGaleraCluster.
 *
 * @param env Salt Connection object or pepperEnv
 * @return result of restoreGaleraCluster(env)
 */
def restoreGaleraDb(env) {
    // 'common' was used without being defined in this function, which fails at runtime.
    def common = new com.mirantis.mk.Common()
    common.warningMsg("This method was renamed to 'restoreGaleraCluster'. Please change your pipeline to use this call instead! If you think that you really wanted to call 'restoreGaleraDb' you may be missing 'targetNode' parameter in you call.")
    return restoreGaleraCluster(env)
}
400
/**
 * Start first node in mysql cluster. Cluster members stay removed in mysql config, additional service restart will be needed once all nodes are up.
 * https://docs.mirantis.com/mcp/q4-18/mcp-operations-guide/tshooting/
 * tshoot-mcp-openstack/tshoot-galera/restore-galera-cluster/
 * restore-galera-manually.html#restore-galera-manually
 *
 * @param env Salt Connection object or pepperEnv
 * @param target last stopped Galera node
 * @return result of the final common.retry call (NOTE(review): depends on Common.retry - confirm)
 */
def startFirstNode(env, target) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()

    // comment out every existing wsrep_cluster_address line ...
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/ s/^#*/#/' /etc/mysql/my.cnf")
    // ... and append an empty gcomm:// address after each of them
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/a wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // start mysql service on the last node
    salt.runSaltProcessStep(env, target, 'service.start', ['mysql'])

    // wait until mysql service on the last node is up
    // (presumably 30 attempts with a 10 second delay - confirm against Common.retry)
    common.retry(30, 10) {
        // Declared locally so the value does not leak into the script binding.
        def value = getWsrepParameters(env, target, 'wsrep_evs_state')
        if (value['wsrep_evs_state'] == 'OPERATIONAL') {
            common.infoMsg('WSREP state: OPERATIONAL')
        } else {
            throw new Exception("Mysql service is not running please fix it.")
        }
    }
}