blob: 3a10a1c7ed211658d4eacfafb9a341a8ef8b2298 [file] [log] [blame]
Martin Polreich8f0f3ac2019-02-15 10:03:33 +01001package com.mirantis.mk
2
3/**
4 *
5 * Galera functions
6 *
7 */
8
9
/**
 * Returns parameters from the mysql.status output of the given target node.
 *
 * @param env          Salt Connection object or pepperEnv
 * @param target       Targeted node
 * @param parameters   Parameters to be returned (String or list of Strings). If no parameters are
 *                     provided, or the value is '[]' or [''], all of them are returned.
 * @param print        When true, the complete sorted mysql.status output is passed to
 *                     common.prettyPrint before the result is built
 * @return result      Map of parameter name to value. Explicitly requested values that are numeric
 *                     strings are converted to BigDecimal; in the "return everything" case the
 *                     values are passed through unconverted.
 */
def getWsrepParameters(env, target, parameters=[], print=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // All working variables are declared with 'def' so they stay local instead of being
    // written to the script binding.
    def result = [:]
    def out = salt.runSaltProcessStep(env, "${target}", "mysql.status", [], null, false)
    // Only the first element of 'return', and within it only the first key, is inspected.
    def outlist = out['return'][0]
    def resultYaml = outlist.get(outlist.keySet()[0]).sort()
    if (print) {
        common.prettyPrint(resultYaml)
    }
    // Accept any CharSequence (String or GString). A GString is not a String, so without this
    // it would fall through to the loop below and be iterated character by character.
    if (parameters instanceof CharSequence) {
        parameters = [parameters.toString()]
    }
    if (parameters == [] || parameters == ['']) {
        result = resultYaml
    } else {
        for (String param in parameters) {
            def value = resultYaml[param]
            if (value instanceof String && value.isBigDecimal()) {
                value = value.toBigDecimal()
            }
            result[param] = value
        }
    }
    return result
}
45
/**
 * Verifies Galera database
 *
 * This function checks for the Galera master, tests the connection and, if reachable, obtains the
 * result of the Salt mysql.status function. The result is then parsed, validated and printed for
 * the user.
 *
 * Return codes produced by this function:
 *   0    - status validated as "OK"
 *   1    - mysql.status output could not be parsed, or status stayed "unknown"
 *   2    - validation reported errors
 *   128  - Galera master could not be listed or reached
 *   129  - Galera slave minion list could not be obtained
 *   130  - no test node could be selected
 *   131  - time sync check failed (only when checkTimeSync is true)
 *   140  - empty iostat response from an xtrabackup client/server node
 *   141  - disk utilization not numeric or not below 50
 *   256  - mysql.status call threw an exception
 *   1024 - mysql.status response empty
 *
 * @param env           Salt Connection object or pepperEnv
 * @param slave         Boolean value to enable slave checking (if master is unreachable)
 * @param checkTimeSync Boolean value to enable time sync check
 * @return resultCode   int value used to determine exit status in the calling function
 */
def verifyGaleraStatus(env, slave=false, checkTimeSync=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    def out = ""
    def status = "unknown"
    def testNode = ""
    if (!slave) {
        try {
            // Declared locally so the value is not written to the script binding.
            def galeraMaster = salt.getMinions(env, "I@galera:master")
            common.infoMsg("Current Galera master is: ${galeraMaster}")
            salt.minionsReachable(env, "I@salt:master", "I@galera:master")
            testNode = "I@galera:master"
        } catch (Exception e) {
            common.errorMsg('Galera master is not reachable.')
            common.errorMsg(e.getMessage())
            return 128
        }
    } else {
        // Declared before the try block because the list is used after it.
        def galeraSlaves = []
        try {
            galeraSlaves = salt.getMinions(env, "I@galera:slave")
            common.infoMsg("Testing Galera slave minions: ${galeraSlaves}")
        } catch (Exception e) {
            common.errorMsg("Cannot obtain Galera slave minions list.")
            common.errorMsg(e.getMessage())
            return 129
        }
        // Use the first slave for which the reachability check does not throw.
        for (minion in galeraSlaves) {
            try {
                salt.minionsReachable(env, "I@salt:master", minion)
                testNode = minion
                break
            } catch (Exception e) {
                common.warningMsg("Slave '${minion}' is not reachable.")
            }
        }
    }
    if (!testNode) {
        common.errorMsg("No Galera slave was reachable.")
        return 130
    }
    // Disk utilization check on every xtrabackup client/server node that answers.
    def checkTargets = salt.getMinions(env, "I@xtrabackup:client or I@xtrabackup:server")
    for (checkTarget in checkTargets) {
        def nodeStatus = salt.minionsReachable(env, 'I@salt:master', checkTarget, null, 10, 5)
        if (nodeStatus != null) {
            def iostatRes = salt.getIostatValues(['saltId': env, 'target': checkTarget, 'parameterName': "%util", 'output': true])
            if (iostatRes == [:]) {
                common.errorMsg("Recevived empty response from iostat call on ${checkTarget}. Maybe 'sysstat' package is not installed?")
                return 140
            }
            for (int i = 0; i < iostatRes.size(); i++) {
                def diskKey = iostatRes.keySet()[i]
                // A value passes only if it is numeric AND strictly below 50.
                if (!(iostatRes[diskKey].toString().isBigDecimal() && (iostatRes[diskKey].toBigDecimal() < 50 ))) {
                    common.errorMsg("Disk ${diskKey} has to high i/o utilization. Maximum value is 50 and current value is ${iostatRes[diskKey]}.")
                    return 141
                }
            }
        }
    }
    common.infoMsg("Disk i/o utilization was checked and everything seems to be in order.")
    if (checkTimeSync && !salt.checkClusterTimeSync(env, "I@galera:master or I@galera:slave")) {
        common.errorMsg("Time in cluster is desynchronized or it couldn't be detemined. You should fix this issue manually before proceeding.")
        return 131
    }
    try {
        out = salt.runSaltProcessStep(env, "${testNode}", "mysql.status", [], null, false)
    } catch (Exception e) {
        common.errorMsg('Could not determine mysql status.')
        common.errorMsg(e.getMessage())
        return 256
    }
    if (out) {
        try {
            status = validateAndPrintGaleraStatusReport(env, out, testNode)
        } catch (Exception e) {
            common.errorMsg('Could not parse the mysql status output. Check it manually.')
            common.errorMsg(e.getMessage())
            return 1
        }
    } else {
        common.errorMsg("Mysql status response unrecognized or is empty. Response: ${out}")
        return 1024
    }
    if (status == "OK") {
        common.infoMsg("No errors found - MySQL status is ${status}.")
        return 0
    } else if (status == "unknown") {
        common.warningMsg('MySQL status cannot be detemined')
        return 1
    } else {
        common.errorMsg("Errors found.")
        return 2
    }
}
150
/**
 * Validates the mysql.status output and prints a report; used by verifyGaleraStatus.
 *
 * The expected cluster size is the number of entries in the galera:<role>:members pillar of the
 * given minion, where <role> is "master" when minion is "I@galera:master" and "slave" otherwise.
 *
 * @param env     Salt Connection object or pepperEnv
 * @param out     Output of the mysql.status Salt function
 * @param minion  Salt target the status was taken from
 * @return status "ERROR" when at least one checked parameter failed, "OK" otherwise
 *                (warnings alone still yield "OK")
 */
def validateAndPrintGaleraStatusReport(env, out, minion) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // Working variables are declared with 'def' so they are not written to the script binding.
    def role = (minion == "I@galera:master") ? "master" : "slave"
    def sizeOut = salt.getReturnValues(salt.getPillar(env, minion, "galera:${role}:members"))
    def expectedClusterSize = sizeOut.size()
    // Only the first element of 'return', and within it only the first key, is inspected.
    def outlist = out['return'][0]
    def resultYaml = outlist.get(outlist.keySet()[0]).sort()
    common.prettyPrint(resultYaml)
    def parameters = [
        wsrep_cluster_status: [title: 'Cluster status', expectedValues: ['Primary'], description: ''],
        wsrep_cluster_size: [title: 'Current cluster size', expectedValues: [expectedClusterSize], description: ''],
        wsrep_ready: [title: 'Node status', expectedValues: ['ON', true], description: ''],
        wsrep_local_state_comment: [title: 'Node status comment', expectedValues: ['Joining', 'Waiting on SST', 'Joined', 'Synced', 'Donor'], description: ''],
        wsrep_connected: [title: 'Node connectivity', expectedValues: ['ON', true], description: ''],
        wsrep_local_recv_queue_avg: [title: 'Average size of local reveived queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(Value above 0 means that the node cannot apply write-sets as fast as it receives them, which can lead to replication throttling)'],
        wsrep_local_send_queue_avg: [title: 'Average size of local send queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(Value above 0 indicate replication throttling or network throughput issues, such as a bottleneck on the network link.)']
    ]

    // Step 1: attach the actual value to each checked parameter (numeric strings become BigDecimal).
    for (key in parameters.keySet()) {
        def value = resultYaml[key]
        if (value instanceof String && value.isBigDecimal()) {
            value = value.toBigDecimal()
        }
        parameters.get(key) << [actualValue: value]
    }

    // Step 2: classify each parameter as 'ok', 'warn' or 'error'.
    for (key in parameters.keySet()) {
        def param = parameters.get(key)
        if (key == 'wsrep_local_recv_queue_avg' || key == 'wsrep_local_send_queue_avg') {
            // Threshold parameters: a missing value counts as an error.
            if (param.get('actualValue') == null || (param.get('actualValue') > param.get('expectedThreshold').get('error'))) {
                param << [match: 'error']
            } else if (param.get('actualValue') > param.get('expectedThreshold').get('warn')) {
                param << [match: 'warn']
            } else {
                param << [match: 'ok']
            }
        } else {
            // Enumerated parameters: 'ok' as soon as any expected value matches.
            for (expValue in param.get('expectedValues')) {
                if (expValue == param.get('actualValue')) {
                    param << [match: 'ok']
                    break
                } else {
                    param << [match: 'error']
                }
            }
        }
    }

    // Step 3: build the per-severity report lines.
    def infoReport = []
    def warningReport = []
    def errorReport = []
    for (key in parameters.keySet()) {
        def param = parameters.get(key)
        def expValues
        if (param.containsKey('expectedThreshold')) {
            expValues = "below ${param.get('expectedThreshold').get('warn')}"
        } else {
            if (param.get('expectedValues').size() > 1) {
                expValues = param.get('expectedValues').join(' or ')
            } else {
                expValues = param.get('expectedValues')[0]
            }
        }
        def reportString = "${param.title}: ${param.actualValue} (Expected: ${expValues}) ${param.description}"
        if (param.get('match').equals('ok')) {
            infoReport.add("[OK ] ${reportString}")
        } else if (param.get('match').equals('warn')) {
            warningReport.add("[WARNING] ${reportString}")
        } else {
            // The original appended a stray ')' after the report string here; removed.
            errorReport.add("[ ERROR] ${reportString}")
        }
    }

    // Step 4: print the report and derive the overall status.
    common.infoMsg("CLUSTER STATUS REPORT: ${infoReport.size()} expected values, ${warningReport.size()} warnings and ${errorReport.size()} error found:")
    if (infoReport.size() > 0) {
        common.infoMsg(infoReport.join('\n'))
    }
    if (warningReport.size() > 0) {
        common.warningMsg(warningReport.join('\n'))
    }
    if (errorReport.size() > 0) {
        common.errorMsg(errorReport.join('\n'))
        return "ERROR"
    } else {
        return "OK"
    }
}
244
/**
 * Returns the last shutdown node of the Galera cluster, i.e. the member whose
 * /var/lib/mysql/grastate.dat reports the highest 'seqno'.
 *
 * @param env   Salt Connection object or pepperEnv
 * @param nodes List of nodes to check only (defaults to []). If not provided, all members from the
 *              galera:master:members pillar are checked. Use this parameter if the cluster splits
 *              into several components and you only want to check one of them.
 * @return      Salt target of the selected node ("S@<host>"), or "I@galera:master" when the member
 *              list cannot be retrieved or no node could be selected
 */
def getGaleraLastShutdownNode(env, nodes = []) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // Working variables are declared with 'def' so they are not written to the script binding.
    def members = []
    def lastNode = [ip: '', seqno: -2]
    try {
        if (nodes) {
            nodes = salt.getIPAddressesForNodenames(env, nodes)
            for (node in nodes) {
                // The member map is wrapped in a list before prepending. A bare 'Map + List'
                // produces a single Map, so 'members' would not be a list of member maps and
                // the 'member.host' lookups below would fail.
                members = [[host: "${node.get(node.keySet()[0])}"]] + members
            }
        } else {
            members = salt.getReturnValues(salt.getPillar(env, "I@galera:master", "galera:master:members"))
        }
    } catch (Exception e) {
        common.errorMsg('Could not retrieve members list')
        common.errorMsg(e.getMessage())
        return 'I@galera:master'
    }
    if (members) {
        for (member in members) {
            // in case if /var/lib/mysql/grastate.dat has no any seqno - it stays 0
            // thus node will be recovered if no other failed found
            def seqno = 0
            try {
                salt.minionsReachable(env, 'I@salt:master', "S@${member.host}")
                def out = salt.getReturnValues(salt.cmdRun(env, "S@${member.host}", 'cat /var/lib/mysql/grastate.dat | grep "seqno" | cut -d ":" -f2', true, null, false))
                def rawSeqno = out.tokenize('\n')[0].trim()
                // Parsed as Long rather than Integer so that sequence numbers beyond the
                // Integer range are compared correctly instead of failing to parse.
                if (rawSeqno.isLong()) {
                    seqno = rawSeqno.toLong()
                }
            } catch (Exception e) {
                common.warningMsg("Could not determine 'seqno' value for node ${member.host} ")
                common.warningMsg(e.getMessage())
                seqno = 0
            }
            // Strict '>' keeps the first member seen when several share the highest seqno.
            if (seqno > lastNode.get('seqno')) {
                lastNode << [ip: "${member.host}", seqno: seqno]
            }
        }
    }
    if (lastNode.get('ip') != '') {
        return "S@${lastNode.ip}"
    } else {
        return "I@galera:master"
    }
}
301
/**
 * Wrapper around Mysql systemd service
 *
 * Runs the Salt function service.<action> for the 'mysql' service on the target node and,
 * optionally, checks the output of 'service mysql status' afterwards. If that check throws,
 * the pipeline pauses on an input step until the operator confirms.
 *
 * @param env         Salt Connection object or pepperEnv
 * @param targetNode  Node to apply changes
 * @param action      Service action; used as the suffix of the Salt function name "service.<action>"
 * @param checkStatus Whether to check status of Mysql
 * @param checkState  State of service to check
 */
def manageServiceMysql(env, targetNode, action, checkStatus=true, checkState='running') {
    def salt = new com.mirantis.mk.Salt()
    // Use the targetNode argument. The previous body read 'lastNodeTarget', a variable that only
    // exists if restoreGaleraCluster assigned it earlier in the same script instance.
    salt.runSaltProcessStep(env, targetNode, "service.${action}", ['mysql'])
    if (checkStatus) {
        try {
            salt.commandStatus(env, targetNode, 'service mysql status', checkState)
        } catch (Exception er) {
            input message: "Database is not running please fix it first and only then click on PROCEED."
        }
    }
}
320
/**
 * Restores Galera cluster
 *
 * Stops mysql on the last shutdown node, optionally moves the current data directory content to
 * /root/mysql/mysql.bak and runs the DB restore, starts mysql with an empty gcomm address, then
 * applies the 'galera' state and restarts mysql.
 *
 * @param env          Salt Connection object or pepperEnv
 * @param runRestoreDb Boolean to determine if the restoration of DB should be run as well
 * @return result of the final manageServiceMysql 'restart' call
 */
def restoreGaleraCluster(env, runRestoreDb=true) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // NOTE: assigned without 'def', so the value is stored in the script binding and stays
    // visible to other functions of this script instance.
    lastNodeTarget = getGaleraLastShutdownNode(env)
    manageServiceMysql(env, lastNodeTarget, 'stop', false)

    // Collect the shell commands first, then run them in order on the selected node.
    def shellCommands = []
    if (runRestoreDb) {
        // back up the current data directory content
        shellCommands << "mkdir -p /root/mysql/mysql.bak"
        shellCommands << "rm -rf /root/mysql/mysql.bak/*"
        shellCommands << "mv /var/lib/mysql/* /root/mysql/mysql.bak"
    }
    shellCommands << "rm -f /var/lib/mysql/.galera_bootstrap"
    // make sure that gcom parameter is empty
    shellCommands << "sed -i '/gcomm/c\\wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf"
    for (shellCommand in shellCommands) {
        salt.cmdRun(env, lastNodeTarget, shellCommand)
    }

    // run restore of DB
    if (runRestoreDb) {
        restoreGaleraDb(env, lastNodeTarget)
    }

    manageServiceMysql(env, lastNodeTarget, 'start')

    // apply any changes in configuration and return value to gcom parameter and then restart mysql to catch
    salt.enforceState(['saltId': env, 'target': lastNodeTarget, 'state': 'galera'])
    return manageServiceMysql(env, lastNodeTarget, 'restart')
}
Martin Polreich9044fe42019-03-21 16:00:23 +0100353
/**
 * Restores Galera database
 *
 * Removes the "<backup_dir>/dbrestored" file on the target node and then enforces the
 * 'xtrabackup.client' and 'xtrabackup.client.restore' states. The backup directory comes from the
 * xtrabackup:client:backup_dir pillar and falls back to /var/backups/mysql/xtrabackup.
 *
 * @param env        Salt Connection object or pepperEnv
 * @param targetNode Node to be targeted
 * @return result of enforcing the 'xtrabackup.client.restore' state
 */
def restoreGaleraDb(env, targetNode) {
    def salt = new com.mirantis.mk.Salt()
    def backupDir = salt.getReturnValues(salt.getPillar(env, targetNode, 'xtrabackup:client:backup_dir'))
    if (backupDir == null || backupDir.isEmpty()) {
        // pillar not set: use the default location
        backupDir = '/var/backups/mysql/xtrabackup'
    }
    def restoredMarker = "${backupDir}/dbrestored"
    salt.runSaltProcessStep(env, targetNode, 'file.remove', [restoredMarker])
    salt.enforceState(['saltId': env, 'target': targetNode, 'state': 'xtrabackup.client'])
    return salt.enforceState(['saltId': env, 'target': targetNode, 'state': 'xtrabackup.client.restore'])
}
367
/**
 * Compatibility overload: logs a warning that this call was renamed to 'restoreGaleraCluster'
 * and then delegates to restoreGaleraCluster(env).
 *
 * @param env Salt Connection object or pepperEnv
 * @return result of restoreGaleraCluster(env)
 */
def restoreGaleraDb(env) {
    def common = new com.mirantis.mk.Common()
    def renameNotice = "This method was renamed to 'restoreGaleraCluster'. Please change your pipeline to use this call instead! If you think that you really wanted to call 'restoreGaleraDb' you may be missing 'targetNode' parameter in you call."
    common.warningMsg(renameNotice)
    return restoreGaleraCluster(env)
}
373
/**
 * Start first node in mysql cluster. Cluster members stay removed in mysql config, additional service restart will be needed once all nodes are up.
 * https://docs.mirantis.com/mcp/q4-18/mcp-operations-guide/tshooting/
 * tshoot-mcp-openstack/tshoot-galera/restore-galera-cluster/
 * restore-galera-manually.html#restore-galera-manually
 *
 * @param env    Salt Connection object or pepperEnv
 * @param target last stopped Galera node
 * @return result of the common.retry call that waits for the node to become operational
 */
def startFirstNode(env, target) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()

    // make sure that gcom parameter is empty:
    // comment out every existing wsrep_cluster_address line, then append an empty gcomm address
    // after each line matching wsrep_cluster_address
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/ s/^#*/#/' /etc/mysql/my.cnf")
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/a wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // start mysql service on the last node
    salt.runSaltProcessStep(env, target, 'service.start', ['mysql'])

    // wait until mysql service on the last node is up; the closure throws while
    // wsrep_evs_state is not OPERATIONAL
    common.retry(30, 10) {
        // declared with 'def' so the value is not written to the script binding
        def value = getWsrepParameters(env, target, 'wsrep_evs_state')
        if (value['wsrep_evs_state'] == 'OPERATIONAL') {
            common.infoMsg('WSREP state: OPERATIONAL')
        } else {
            throw new Exception("Mysql service is not running please fix it.")
        }
    }
}
Martin Polreich9044fe42019-03-21 16:00:23 +0100405}