blob: 73191635ae547169b3ea8f410d364e677c559f5e [file] [log] [blame]
Martin Polreich8f0f3ac2019-02-15 10:03:33 +01001package com.mirantis.mk
2
3/**
4 *
5 * Galera functions
6 *
7 */
8
9
/**
 * Returns parameters from mysql.status output on given target node
 *
 * @param env Salt Connection object or pepperEnv
 * @param target Targeted node
 * @param parameters Parameters to be returned (String or list of Strings). If no parameters are provided
 *                   or the value is '[]' or [''], all of them are returned.
 * @param print Boolean, when true the whole mysql.status output of the node is pretty-printed
 * @return result Map of parameter names and their values (numeric strings are converted to BigDecimal
 *                when specific parameters are requested)
 */
def getWsrepParameters(env, target, parameters=[], print=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // All working variables are declared with 'def' so they stay local and are
    // not written into the script binding shared by other calls.
    def result = [:]
    def out = salt.runSaltProcessStep(env, "${target}", "mysql.status", [], null, false)
    def outlist = out['return'][0]
    // Only the entry of the first key in the response is evaluated
    def resultYaml = outlist.get(outlist.keySet()[0]).sort()
    if (print) {
        common.prettyPrint(resultYaml)
    }
    // Accept a single parameter name passed as a plain String
    if (parameters instanceof String) {
        parameters = [parameters]
    }
    if (parameters == [] || parameters == ['']) {
        result = resultYaml
    } else {
        for (String param in parameters) {
            def value = resultYaml[param]
            if (value instanceof String && value.isBigDecimal()) {
                value = value.toBigDecimal()
            }
            result[param] = value
        }
    }
    return result
}
45
/**
 * Verifies Galera database
 *
 * This function checks for Galera master, tests connection and if reachable, it obtains the result
 * of Salt mysql.status function. The result is then parsed, validated and printed to the user.
 * Before that, disk i/o utilization of the xtrabackup client/server nodes is checked and,
 * optionally, the time synchronization of the Galera nodes.
 *
 * @param env Salt Connection object or pepperEnv
 * @param slave Boolean value to enable slave checking (if master is unreachable)
 * @param checkTimeSync Boolean value to enable time sync check
 * @return resultCode int value used to determine exit status in the calling function:
 *         0    - MySQL status is OK
 *         1    - status could not be determined or the mysql.status output could not be parsed
 *         2    - errors were found in the status report
 *         128  - Galera master is not reachable
 *         129  - list of Galera slave minions could not be obtained
 *         130  - no node to test was reachable
 *         131  - cluster time is desynchronized or could not be determined
 *         140  - empty response from the iostat call
 *         141  - disk i/o utilization is not a number below 50
 *         256  - mysql.status call failed
 *         1024 - mysql.status response is empty
 */
def verifyGaleraStatus(env, slave=false, checkTimeSync=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    def out = ""
    def status = "unknown"
    def testNode = ""
    if (!slave) {
        try {
            def galeraMaster = salt.getMinions(env, "I@galera:master")
            common.infoMsg("Current Galera master is: ${galeraMaster}")
            salt.minionsReachable(env, "I@salt:master", "I@galera:master")
            testNode = "I@galera:master"
        } catch (Exception e) {
            common.errorMsg('Galera master is not reachable.')
            common.errorMsg(e.getMessage())
            return 128
        }
    } else {
        // Declared outside of the try block because the list is iterated after it
        def galeraSlaves = []
        try {
            galeraSlaves = salt.getMinions(env, "I@galera:slave")
            common.infoMsg("Testing Galera slave minions: ${galeraSlaves}")
        } catch (Exception e) {
            common.errorMsg("Cannot obtain Galera slave minions list.")
            common.errorMsg(e.getMessage())
            return 129
        }
        // The first reachable slave becomes the node under test
        for (minion in galeraSlaves) {
            try {
                salt.minionsReachable(env, "I@salt:master", minion)
                testNode = minion
                break
            } catch (Exception e) {
                common.warningMsg("Slave '${minion}' is not reachable.")
            }
        }
    }
    if (!testNode) {
        common.errorMsg("No Galera slave was reachable.")
        return 130
    }
    def checkTargets = salt.getMinions(env, "I@xtrabackup:client or I@xtrabackup:server")
    for (checkTarget in checkTargets) {
        def nodeStatus = salt.minionsReachable(env, 'I@salt:master', checkTarget, null, 10, 5)
        if (nodeStatus != null) {
            def iostatRes = salt.getIostatValues(['saltId': env, 'target': checkTarget, 'parameterName': "%util", 'output': true])
            // A null response is treated like an empty one instead of failing on it below
            if (iostatRes == null || iostatRes == [:]) {
                common.errorMsg("Recevived empty response from iostat call on ${checkTarget}. Maybe 'sysstat' package is not installed?")
                return 140
            }
            for (diskKey in iostatRes.keySet()) {
                // Fails both for non-numeric values and for utilization of 50 or more
                if (!(iostatRes[diskKey].toString().isBigDecimal() && (iostatRes[diskKey].toBigDecimal() < 50 ))) {
                    common.errorMsg("Disk ${diskKey} has to high i/o utilization. Maximum value is 50 and current value is ${iostatRes[diskKey]}.")
                    return 141
                }
            }
        }
    }
    common.infoMsg("Disk i/o utilization was checked and everything seems to be in order.")
    if (checkTimeSync && !salt.checkClusterTimeSync(env, "I@galera:master or I@galera:slave")) {
        common.errorMsg("Time in cluster is desynchronized or it couldn't be detemined. You should fix this issue manually before proceeding.")
        return 131
    }
    try {
        out = salt.runSaltProcessStep(env, "${testNode}", "mysql.status", [], null, false)
    } catch (Exception e) {
        common.errorMsg('Could not determine mysql status.')
        common.errorMsg(e.getMessage())
        return 256
    }
    if (out) {
        try {
            status = validateAndPrintGaleraStatusReport(env, out, testNode)
        } catch (Exception e) {
            common.errorMsg('Could not parse the mysql status output. Check it manually.')
            common.errorMsg(e.getMessage())
            return 1
        }
    } else {
        common.errorMsg("Mysql status response unrecognized or is empty. Response: ${out}")
        return 1024
    }
    if (status == "OK") {
        common.infoMsg("No errors found - MySQL status is ${status}.")
        return 0
    } else if (status == "unknown") {
        common.warningMsg('MySQL status cannot be detemined')
        return 1
    } else {
        common.errorMsg("Errors found.")
        return 2
    }
}
150
/**
 * Validates and prints result of verifyGaleraStatus function
 *
 * The expected cluster size is taken from the 'galera:<role>:members' pillar of the given minion.
 * Each checked wsrep parameter is classified as ok, warning or error and the report is printed.
 *
 * @param env Salt Connection object or pepperEnv
 * @param out Output of the mysql.status Salt function
 * @param minion Target whose status is validated; "I@galera:master" selects the master role,
 *               any other value selects the slave role
 * @return status "OK" when no parameter is classified as error, "ERROR" otherwise
 */
def validateAndPrintGaleraStatusReport(env, out, minion) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    def role = (minion == "I@galera:master") ? "master" : "slave"
    def sizeOut = salt.getReturnValues(salt.getPillar(env, minion, "galera:${role}:members"))
    def expected_cluster_size = sizeOut.size()
    def outlist = out['return'][0]
    def resultYaml = outlist.get(outlist.keySet()[0]).sort()
    common.prettyPrint(resultYaml)
    def parameters = [
        wsrep_cluster_status: [title: 'Cluster status', expectedValues: ['Primary'], description: ''],
        wsrep_cluster_size: [title: 'Current cluster size', expectedValues: [expected_cluster_size], description: ''],
        wsrep_ready: [title: 'Node status', expectedValues: ['ON', true], description: ''],
        wsrep_local_state_comment: [title: 'Node status comment', expectedValues: ['Joining', 'Waiting on SST', 'Joined', 'Synced', 'Donor'], description: ''],
        wsrep_connected: [title: 'Node connectivity', expectedValues: ['ON', true], description: ''],
        wsrep_local_recv_queue_avg: [title: 'Average size of local reveived queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(Value above 0 means that the node cannot apply write-sets as fast as it receives them, which can lead to replication throttling)'],
        wsrep_local_send_queue_avg: [title: 'Average size of local send queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(Value above 0 indicate replication throttling or network throughput issues, such as a bottleneck on the network link.)']
    ]
    // Attach the actual value reported by mysql.status to every checked parameter
    for (key in parameters.keySet()) {
        def value = resultYaml[key]
        if (value instanceof String && value.isBigDecimal()) {
            value = value.toBigDecimal()
        }
        parameters.get(key) << [actualValue: value]
    }
    // Classify every parameter as 'ok', 'warn' or 'error'
    for (key in parameters.keySet()) {
        def param = parameters.get(key)
        def actual = param.get('actualValue')
        if (param.containsKey('expectedThreshold')) {
            def threshold = param.get('expectedThreshold')
            // A missing or non-numeric value cannot be compared with the thresholds,
            // so it is reported as an error instead of failing the comparison.
            if (!(actual instanceof Number) || actual > threshold.get('error')) {
                param << [match: 'error']
            } else if (actual > threshold.get('warn')) {
                param << [match: 'warn']
            } else {
                param << [match: 'ok']
            }
        } else {
            // 'error' unless one of the expected values equals the actual value
            param << [match: 'error']
            for (expValue in param.get('expectedValues')) {
                if (expValue == actual) {
                    param << [match: 'ok']
                    break
                }
            }
        }
    }
    def cluster_info_report = []
    def cluster_warning_report = []
    def cluster_error_report = []
    for (key in parameters.keySet()) {
        def param = parameters.get(key)
        def expValues
        if (param.containsKey('expectedThreshold')) {
            expValues = "below ${param.get('expectedThreshold').get('warn')}"
        } else {
            if (param.get('expectedValues').size() > 1) {
                expValues = param.get('expectedValues').join(' or ')
            } else {
                expValues = param.get('expectedValues')[0]
            }
        }
        def reportString = "${param.title}: ${param.actualValue} (Expected: ${expValues}) ${param.description}"
        if (param.get('match').equals('ok')) {
            cluster_info_report.add("[OK ] ${reportString}")
        } else if (param.get('match').equals('warn')) {
            cluster_warning_report.add("[WARNING] ${reportString}")
        } else {
            // The original appended an unbalanced ')' to every error line
            cluster_error_report.add("[ ERROR] ${reportString}")
        }
    }
    common.infoMsg("CLUSTER STATUS REPORT: ${cluster_info_report.size()} expected values, ${cluster_warning_report.size()} warnings and ${cluster_error_report.size()} error found:")
    if (cluster_info_report.size() > 0) {
        common.infoMsg(cluster_info_report.join('\n'))
    }
    if (cluster_warning_report.size() > 0) {
        common.warningMsg(cluster_warning_report.join('\n'))
    }
    if (cluster_error_report.size() > 0) {
        common.errorMsg(cluster_error_report.join('\n'))
        return "ERROR"
    } else {
        return "OK"
    }
}
244
/**
 * Returns last shutdown node of Galera cluster
 *
 * The node is selected by the highest 'seqno' value found in /var/lib/mysql/grastate.dat
 * of the reachable members.
 *
 * @param env Salt Connection object or pepperEnv
 * @param nodes List of nodes to check only (defaults to []). If not provided, it will check all nodes
 *              listed in the 'galera:master:members' pillar. Use this parameter if the cluster splits
 *              to several components and you only want to check one of them.
 * @return Salt target of the last shutdown node ("S@<ip address or hostname>"), or "I@galera:master"
 *         when the members list cannot be retrieved or no node provides a usable seqno
 */
def getGaleraLastShutdownNode(env, nodes = []) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    def members = []
    def lastNode = [ip: '', seqno: -2]
    try {
        if (nodes) {
            nodes = salt.getIPAddressesForNodenames(env, nodes)
            for (node in nodes) {
                // Every member has to be a separate map inside the list. Adding the list to a
                // bare map ("[host: x] + members") merges map entries and produces a single map.
                members = [[host: "${node.get(node.keySet()[0])}"]] + members
            }
        } else {
            members = salt.getReturnValues(salt.getPillar(env, "I@galera:master", "galera:master:members"))
        }
    } catch (Exception e) {
        common.errorMsg('Could not retrieve members list')
        common.errorMsg(e.getMessage())
        return 'I@galera:master'
    }
    if (members) {
        for (member in members) {
            try {
                salt.minionsReachable(env, 'I@salt:master', "S@${member.host}")
                def out = salt.getReturnValues(salt.cmdRun(env, "S@${member.host}", 'cat /var/lib/mysql/grastate.dat | grep "seqno" | cut -d ":" -f2', true, null, false))
                def seqno = out.tokenize('\n')[0].trim()
                // Parsed as long: Galera sequence numbers can exceed the 32-bit integer range.
                // Anything that is not a whole number counts as -2 and is never selected.
                if (seqno.isLong()) {
                    seqno = seqno.toLong()
                } else {
                    seqno = -2
                }
                def highestSeqno = lastNode.get('seqno')
                if (seqno > highestSeqno) {
                    lastNode << [ip: "${member.host}", seqno: seqno]
                }
            } catch (Exception e) {
                // Best effort: a node that cannot be inspected is skipped
                common.warningMsg("Could not determine 'seqno' value for node ${member.host} ")
                common.warningMsg(e.getMessage())
            }
        }
    }
    if (lastNode.get('ip') != '') {
        return "S@${lastNode.ip}"
    } else {
        return "I@galera:master"
    }
}
298
/**
 * Restores Galera cluster
 *
 * Stops mysql on all Galera nodes, bootstraps the cluster from the last shutdown node
 * (see getGaleraLastShutdownNode), optionally restores the database on it, starts the
 * remaining nodes and finally enforces the 'galera' state on the bootstrap node.
 *
 * @param env Salt Connection object or pepperEnv
 * @param runRestoreDb Boolean to determine if the restoration of DB should be run as well
 * @return result of the final salt.enforceState call (last statement of the method)
 */
def restoreGaleraCluster(env, runRestoreDb=true) {
    def salt = new com.mirantis.mk.Salt()
    salt.runSaltProcessStep(env, 'I@galera:slave', 'service.stop', ['mysql'])
    salt.runSaltProcessStep(env, 'I@galera:master', 'service.stop', ['mysql'])
    // Declared locally so the target does not leak into the script binding
    def lastNodeTarget = getGaleraLastShutdownNode(env)
    def otherNodesTarget = "( I@galera:master or I@galera:slave ) and not ${lastNodeTarget}"
    salt.cmdRun(env, otherNodesTarget, "rm -f /var/lib/mysql/ib_logfile*")
    salt.cmdRun(env, otherNodesTarget, "rm -f /var/lib/mysql/grastate.dat")
    if (runRestoreDb) {
        // move the current data directory content of the last node aside
        salt.cmdRun(env, lastNodeTarget, "mkdir -p /root/mysql/mysql.bak")
        salt.cmdRun(env, lastNodeTarget, "rm -rf /root/mysql/mysql.bak/*")
        salt.cmdRun(env, lastNodeTarget, "mv /var/lib/mysql/* /root/mysql/mysql.bak")
    }
    salt.cmdRun(env, lastNodeTarget, "rm -f /var/lib/mysql/.galera_bootstrap")

    // make sure that gcomm parameter is empty
    salt.cmdRun(env, lastNodeTarget, "sed -i '/gcomm/c\\wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // run restore of DB
    if (runRestoreDb) {
        restoreGaleraDb(env, lastNodeTarget)
    }

    // start mysql service on the last node
    salt.runSaltProcessStep(env, lastNodeTarget, 'service.start', ['mysql'])

    // wait until mysql service on the last node is up
    try {
        salt.commandStatus(env, lastNodeTarget, 'service mysql status', 'running')
    } catch (Exception er) {
        // pause the pipeline and let the operator fix the database manually
        input message: "Database is not running please fix it first and only then click on PROCEED."
    }

    // start mysql services on the rest of the nodes
    salt.runSaltProcessStep(env, "I@galera:master and not ${lastNodeTarget}", 'service.start', ['mysql'])
    salt.runSaltProcessStep(env, "I@galera:slave and not ${lastNodeTarget}", 'service.start', ['mysql'])

    // wait until mysql service on the rest of the nodes is up
    try {
        salt.commandStatus(env, otherNodesTarget, 'service mysql status', 'running')
    } catch (Exception er) {
        input message: "Database is not running please fix it first and only then click on PROCEED."
    }

    // apply any changes in configuration and return value to gcomm parameter
    salt.enforceState(['saltId': env, 'target': lastNodeTarget, 'state': 'galera'])
}
Martin Polreich9044fe42019-03-21 16:00:23 +0100353
/**
 * Restores Galera database on the given node
 *
 * Removes the 'dbrestored' file from the xtrabackup backup directory and enforces the
 * 'xtrabackup.client' and 'xtrabackup.client.restore' states on the node.
 *
 * @param env Salt Connection object or pepperEnv
 * @param targetNode Node to be targeted
 */
def restoreGaleraDb(env, targetNode) {
    def saltApi = new com.mirantis.mk.Salt()
    def pillarResponse = saltApi.getPillar(env, targetNode, 'xtrabackup:client:backup_dir')
    def configuredDir = saltApi.getReturnValues(pillarResponse)
    // fall back to the default location when the pillar provides no backup directory
    def backupDir = (configuredDir == null || configuredDir.isEmpty()) ? '/var/backups/mysql/xtrabackup' : configuredDir
    saltApi.runSaltProcessStep(env, targetNode, 'file.remove', ["${backupDir}/dbrestored"])
    saltApi.enforceState(['saltId': env, 'target': targetNode, 'state': 'xtrabackup.client'])
    saltApi.enforceState(['saltId': env, 'target': targetNode, 'state': 'xtrabackup.client.restore'])
}
367
/**
 * Backward-compatible single-argument variant: prints a warning that the method was renamed
 * and delegates to restoreGaleraCluster.
 *
 * @param env Salt Connection object or pepperEnv
 * @return result of restoreGaleraCluster(env)
 */
def restoreGaleraDb(env) {
    def renameNotice = "This method was renamed to 'restoreGaleraCluster'. Please change your pipeline to use this call instead! If you think that you really wanted to call 'restoreGaleraDb' you may be missing 'targetNode' parameter in you call."
    new com.mirantis.mk.Common().warningMsg(renameNotice)
    return restoreGaleraCluster(env)
}
373
/**
 * Start first node in mysql cluster. Cluster members stay removed in mysql config, additional service restart will be needed once all nodes are up.
 * https://docs.mirantis.com/mcp/q4-18/mcp-operations-guide/tshooting/
 * tshoot-mcp-openstack/tshoot-galera/restore-galera-cluster/
 * restore-galera-manually.html#restore-galera-manually
 *
 * @param env Salt Connection object or pepperEnv
 * @param target last stopped Galera node
 * @return result of the common.retry call that waits for the OPERATIONAL state
 */
def startFirstNode(env, target) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()

    // make sure that gcomm parameter is empty: comment out the existing wsrep_cluster_address
    // lines and append an empty "gcomm://" address after them
    // NOTE(review): the append is applied after every matching line, so repeated runs add
    // duplicate entries to my.cnf - confirm this is acceptable
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/ s/^#*/#/' /etc/mysql/my.cnf")
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/a wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // start mysql service on the last node
    salt.runSaltProcessStep(env, target, 'service.start', ['mysql'])

    // wait until mysql service on the last node is up
    // NOTE(review): presumably 30 attempts with a 10 second delay - confirm against Common.retry
    common.retry(30, 10) {
        // declared locally so the value is not stored in the script binding
        def value = getWsrepParameters(env, target, 'wsrep_evs_state')
        if (value['wsrep_evs_state'] == 'OPERATIONAL') {
            common.infoMsg('WSREP state: OPERATIONAL')
        } else {
            throw new Exception("Mysql service is not running please fix it.")
        }
    }
}