blob: d5cdc6e0b77f73fb82414063a8027f1c9eea04e6 [file] [log] [blame]
Martin Polreichf89f9b42019-05-07 15:37:13 +02001package com.mirantis.mk
2
3/**
4 *
5 * Galera functions
6 *
7 */
8
9
/**
 * Returns parameters from mysql.status output on given target node
 *
 * @param env Salt Connection object or pepperEnv
 * @param target Targeted node
 * @param parameters Parameters to be returned (String or list of Strings). If no parameters are provided
 *                   or it is set to '[]' (or ['']), all of them are returned.
 * @param print Boolean value; when true the complete sorted mysql.status output is pretty-printed
 * @return result Map of parameter names and their values. When specific parameters are requested,
 *                values that are numeric strings are converted to BigDecimal; when all parameters
 *                are returned, the values are passed through unchanged.
 */
def getWsrepParameters(env, target, parameters=[], print=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // All locals are declared with 'def' so they stay local instead of leaking into the script binding
    def result = [:]
    def out = salt.runSaltProcessStep(env, "${target}", "mysql.status", [], null, false)
    def outlist = out['return'][0]
    // Only the response of the first minion in the result is evaluated
    def resultYaml = outlist.get(outlist.keySet()[0]).sort()
    if (print) {
        common.prettyPrint(resultYaml)
    }
    // Accept a single name given as String or GString and normalize it to a list
    if (parameters instanceof CharSequence) {
        parameters = [parameters.toString()]
    }
    if (parameters == [] || parameters == ['']) {
        result = resultYaml
    } else {
        for (String param in parameters) {
            def value = resultYaml[param]
            if (value instanceof String && value.isBigDecimal()) {
                value = value.toBigDecimal()
            }
            result[param] = value
        }
    }
    return result
}
45
/**
 * Verifies Galera database
 *
 * This function checks for Galera master, tests connection and if reachable, it obtains the result
 * of Salt mysql.status function. The result is then parsed, validated and output to the user.
 * Before mysql.status is queried, disk i/o utilization of the xtrabackup client/server nodes is
 * checked and, optionally, the time synchronization of the Galera nodes.
 *
 * @param env Salt Connection object or pepperEnv
 * @param slave Boolean value to enable slave checking (if master is unreachable)
 * @param checkTimeSync Boolean value to enable time sync check
 * @return resultCode int value used to determine exit status in the calling function:
 *         0    - MySQL status is OK
 *         1    - mysql.status output could not be parsed or the status stayed undetermined
 *         2    - validation found errors
 *         128  - Galera master is not reachable
 *         129  - list of Galera slave minions could not be obtained
 *         130  - no node to test against was reachable
 *         131  - cluster time is desynchronized or could not be determined
 *         140  - iostat returned an empty response
 *         141  - disk i/o utilization is not a number below 50
 *         256  - mysql.status call failed
 *         1024 - mysql.status response is empty
 */
def verifyGaleraStatus(env, slave=false, checkTimeSync=false) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    def out = ""
    def status = "unknown"
    def testNode = ""
    if (!slave) {
        try {
            def galeraMaster = salt.getMinions(env, "I@galera:master")
            common.infoMsg("Current Galera master is: ${galeraMaster}")
            salt.minionsReachable(env, "I@salt:master", "I@galera:master")
            testNode = "I@galera:master"
        } catch (Exception e) {
            common.errorMsg('Galera master is not reachable.')
            common.errorMsg(e.getMessage())
            return 128
        }
    } else {
        // Declared outside of the try block because the list is iterated after it
        def galeraSlaves = []
        try {
            galeraSlaves = salt.getMinions(env, "I@galera:slave")
            common.infoMsg("Testing Galera slave minions: ${galeraSlaves}")
        } catch (Exception e) {
            common.errorMsg("Cannot obtain Galera slave minions list.")
            common.errorMsg(e.getMessage())
            return 129
        }
        // The first reachable slave is used as the test node
        for (minion in galeraSlaves) {
            try {
                salt.minionsReachable(env, "I@salt:master", minion)
                testNode = minion
                break
            } catch (Exception e) {
                common.warningMsg("Slave '${minion}' is not reachable.")
            }
        }
    }
    if (!testNode) {
        common.errorMsg("No Galera slave was reachable.")
        return 130
    }
    def checkTargets = salt.getMinions(env, "I@xtrabackup:client or I@xtrabackup:server")
    for (checkTarget in checkTargets) {
        def nodeStatus = salt.minionsReachable(env, 'I@salt:master', checkTarget, null, 10, 5)
        if (nodeStatus != null) {
            def iostatRes = salt.getIostatValues(['saltId': env, 'target': checkTarget, 'parameterName': "%util", 'output': true])
            // Groovy truth covers both an empty map and a null response
            if (!iostatRes) {
                common.errorMsg("Recevived empty response from iostat call on ${checkTarget}. Maybe 'sysstat' package is not installed?")
                return 140
            }
            for (diskKey in iostatRes.keySet()) {
                def utilization = iostatRes[diskKey]
                // A value that is not numeric is treated the same way as a value over the limit
                if (!(utilization.toString().isBigDecimal() && (utilization.toBigDecimal() < 50 ))) {
                    common.errorMsg("Disk ${diskKey} has to high i/o utilization. Maximum value is 50 and current value is ${iostatRes[diskKey]}.")
                    return 141
                }
            }
        }
    }
    common.infoMsg("Disk i/o utilization was checked and everything seems to be in order.")
    if (checkTimeSync && !salt.checkClusterTimeSync(env, "I@galera:master or I@galera:slave")) {
        common.errorMsg("Time in cluster is desynchronized or it couldn't be detemined. You should fix this issue manually before proceeding.")
        return 131
    }
    try {
        out = salt.runSaltProcessStep(env, "${testNode}", "mysql.status", [], null, false)
    } catch (Exception e) {
        common.errorMsg('Could not determine mysql status.')
        common.errorMsg(e.getMessage())
        return 256
    }
    if (out) {
        try {
            status = validateAndPrintGaleraStatusReport(env, out, testNode)
        } catch (Exception e) {
            common.errorMsg('Could not parse the mysql status output. Check it manually.')
            common.errorMsg(e.getMessage())
            return 1
        }
    } else {
        common.errorMsg("Mysql status response unrecognized or is empty. Response: ${out}")
        return 1024
    }
    if (status == "OK") {
        common.infoMsg("No errors found - MySQL status is ${status}.")
        return 0
    } else if (status == "unknown") {
        common.warningMsg('MySQL status cannot be detemined')
        return 1
    } else {
        common.errorMsg("Errors found.")
        return 2
    }
}
150
/**
 * Validates and prints result of verifyGaleraStatus function
 *
 * The expected cluster size is taken from the number of entries in the 'galera:<role>:members' pillar
 * of the given minion. The selected wsrep_* values of the mysql.status output are compared with
 * their expected values (or thresholds for the queue averages) and a report is printed.
 *
 * @param env Salt Connection object or pepperEnv
 * @param out Output of the mysql.status Salt function
 * @param minion Target the output was obtained from; "I@galera:master" selects the master pillar,
 *               anything else selects the slave pillar
 * @return status "OK" when no checked parameter is in error state (warnings are allowed), "ERROR" otherwise
 */
def validateAndPrintGaleraStatusReport(env, out, minion) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // All locals are declared with 'def' so they stay local instead of leaking into the script binding
    def role = (minion == "I@galera:master") ? "master" : "slave"
    def sizeOut = salt.getReturnValues(salt.getPillar(env, minion, "galera:${role}:members"))
    def expected_cluster_size = sizeOut.size()
    def outlist = out['return'][0]
    // Only the response of the first minion in the result is evaluated
    def resultYaml = outlist.get(outlist.keySet()[0]).sort()
    common.prettyPrint(resultYaml)
    def parameters = [
        wsrep_cluster_status: [title: 'Cluster status', expectedValues: ['Primary'], description: ''],
        wsrep_cluster_size: [title: 'Current cluster size', expectedValues: [expected_cluster_size], description: ''],
        wsrep_ready: [title: 'Node status', expectedValues: ['ON', true], description: ''],
        wsrep_local_state_comment: [title: 'Node status comment', expectedValues: ['Joining', 'Waiting on SST', 'Joined', 'Synced', 'Donor'], description: ''],
        wsrep_connected: [title: 'Node connectivity', expectedValues: ['ON', true], description: ''],
        wsrep_local_recv_queue_avg: [title: 'Average size of local reveived queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(Value above 0 means that the node cannot apply write-sets as fast as it receives them, which can lead to replication throttling)'],
        wsrep_local_send_queue_avg: [title: 'Average size of local send queue', expectedThreshold: [warn: 0.5, error: 1.0], description: '(Value above 0 indicate replication throttling or network throughput issues, such as a bottleneck on the network link.)']
    ]
    // Attach the actual value to every checked parameter; numeric strings become BigDecimal
    for (key in parameters.keySet()) {
        def value = resultYaml[key]
        if (value instanceof String && value.isBigDecimal()) {
            value = value.toBigDecimal()
        }
        parameters.get(key) << [actualValue: value]
    }
    // Classify every parameter as 'ok', 'warn' or 'error'
    for (key in parameters.keySet()) {
        def param = parameters.get(key)
        def actual = param.get('actualValue')
        if (key == 'wsrep_local_recv_queue_avg' || key == 'wsrep_local_send_queue_avg') {
            def threshold = param.get('expectedThreshold')
            // A missing or non-numeric value cannot be compared with the thresholds, report it as an error
            if (!(actual instanceof Number) || (actual > threshold.get('error'))) {
                param << [match: 'error']
            } else if (actual > threshold.get('warn')) {
                param << [match: 'warn']
            } else {
                param << [match: 'ok']
            }
        } else {
            // '==' is used on purpose (instead of contains()) so that e.g. Integer 3 matches BigDecimal 3
            param << [match: 'error']
            for (expValue in param.get('expectedValues')) {
                if (expValue == actual) {
                    param << [match: 'ok']
                    break
                }
            }
        }
    }
    def cluster_info_report = []
    def cluster_warning_report = []
    def cluster_error_report = []
    for (key in parameters.keySet()) {
        def param = parameters.get(key)
        def expValues
        if (param.containsKey('expectedThreshold')) {
            expValues = "below ${param.get('expectedThreshold').get('warn')}"
        } else if (param.get('expectedValues').size() > 1) {
            expValues = param.get('expectedValues').join(' or ')
        } else {
            expValues = param.get('expectedValues')[0]
        }
        def reportString = "${param.title}: ${param.actualValue} (Expected: ${expValues}) ${param.description}"
        if (param.get('match').equals('ok')) {
            cluster_info_report.add("[OK     ] ${reportString}")
        } else if (param.get('match').equals('warn')) {
            cluster_warning_report.add("[WARNING] ${reportString}")
        } else {
            cluster_error_report.add("[  ERROR] ${reportString}")
        }
    }
    common.infoMsg("CLUSTER STATUS REPORT: ${cluster_info_report.size()} expected values, ${cluster_warning_report.size()} warnings and ${cluster_error_report.size()} error found:")
    if (cluster_info_report.size() > 0) {
        common.infoMsg(cluster_info_report.join('\n'))
    }
    if (cluster_warning_report.size() > 0) {
        common.warningMsg(cluster_warning_report.join('\n'))
    }
    if (cluster_error_report.size() > 0) {
        common.errorMsg(cluster_error_report.join('\n'))
        return "ERROR"
    } else {
        return "OK"
    }
}
244
/**
 * Returns last shutdown node of Galera cluster
 *
 * The node is selected by the highest 'seqno' value found in /var/lib/mysql/grastate.dat of the
 * reachable members.
 *
 * @param env Salt Connection object or pepperEnv
 * @param nodes List of nodes to check only (defaults to []). If not provided, it will check all nodes.
 *              Use this parameter if the cluster splits to several components and you only want to check one of them.
 * @return Salt target of the last shutdown node in form "S@<ip address or hostname>". "I@galera:master" is
 *         returned when the members list cannot be retrieved or no 'seqno' value could be read.
 */
def getGaleraLastShutdownNode(env, nodes = []) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()
    // All locals are declared with 'def' so they stay local instead of leaking into the script binding
    def members = []
    def lastNode = [ip: '', seqno: -2]
    try {
        if (nodes) {
            nodes = salt.getIPAddressesForNodenames(env, nodes)
            for (node in nodes) {
                // Prepend a one-element list. 'Map + List' would merge everything into a single Map
                // and the 'member.host' access below would not work.
                // NOTE(review): assumes every entry is a single-key map nodename -> address - confirm against Salt.getIPAddressesForNodenames
                members = [[host: "${node.get(node.keySet()[0])}"]] + members
            }
        } else {
            members = salt.getReturnValues(salt.getPillar(env, "I@galera:master", "galera:master:members"))
        }
    } catch (Exception e) {
        common.errorMsg('Could not retrieve members list')
        common.errorMsg(e.getMessage())
        return 'I@galera:master'
    }
    if (members) {
        for (member in members) {
            try {
                salt.minionsReachable(env, 'I@salt:master', "S@${member.host}")
                def out = salt.getReturnValues(salt.cmdRun(env, "S@${member.host}", 'cat /var/lib/mysql/grastate.dat | grep "seqno" | cut -d ":" -f2', true, null, false))
                def seqnoText = out.tokenize('\n')[0].trim()
                // Parsed as long so that values above the int range are not rejected;
                // anything which is not a whole number counts as -2 and is never selected
                def seqno = seqnoText.isLong() ? seqnoText.toLong() : -2
                def highestSeqno = lastNode.get('seqno')
                if (seqno > highestSeqno) {
                    lastNode << [ip: "${member.host}", seqno: seqno]
                }
            } catch (Exception e) {
                // Best effort: a node which cannot be evaluated is skipped
                common.warningMsg("Could not determine 'seqno' value for node ${member.host} ")
                common.warningMsg(e.getMessage())
            }
        }
    }
    if (lastNode.get('ip') != '') {
        return "S@${lastNode.ip}"
    } else {
        return "I@galera:master"
    }
}
298
/**
 * Restores Galera cluster
 *
 * Stops mysql on all Galera nodes, determines the last shutdown node, bootstraps the cluster from it
 * (optionally restoring the database there first) and then starts the remaining nodes.
 * When mysql does not reach the running state, the pipeline waits for user confirmation.
 *
 * @param env Salt Connection object or pepperEnv
 * @param runRestoreDb Boolean to determine if the restoration of DB should be run as well
 * @return result of the final 'galera' state enforcement on the last shutdown node
 */
def restoreGaleraCluster(env, runRestoreDb=true) {
    def salt = new com.mirantis.mk.Salt()
    salt.runSaltProcessStep(env, 'I@galera:slave', 'service.stop', ['mysql'])
    salt.runSaltProcessStep(env, 'I@galera:master', 'service.stop', ['mysql'])
    // Declared with 'def' so the target stays local instead of leaking into the script binding
    def lastNodeTarget = getGaleraLastShutdownNode(env)
    // Every Galera node except the one the cluster is bootstrapped from
    def otherNodesTarget = "( I@galera:master or I@galera:slave ) and not ${lastNodeTarget}"
    salt.cmdRun(env, otherNodesTarget, "rm -f /var/lib/mysql/ib_logfile*")
    salt.cmdRun(env, otherNodesTarget, "rm -f /var/lib/mysql/grastate.dat")
    if (runRestoreDb) {
        // move the current data directory content aside before the restore
        salt.cmdRun(env, lastNodeTarget, "mkdir -p /root/mysql/mysql.bak")
        salt.cmdRun(env, lastNodeTarget, "rm -rf /root/mysql/mysql.bak/*")
        salt.cmdRun(env, lastNodeTarget, "mv /var/lib/mysql/* /root/mysql/mysql.bak")
    }
    salt.cmdRun(env, lastNodeTarget, "rm -f /var/lib/mysql/.galera_bootstrap")

    // make sure that gcom parameter is empty
    salt.cmdRun(env, lastNodeTarget, "sed -i '/gcomm/c\\wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // run restore of DB
    if (runRestoreDb) {
        restoreGaleraDb(env, lastNodeTarget)
    }

    // start mysql service on the last node
    salt.runSaltProcessStep(env, lastNodeTarget, 'service.start', ['mysql'])

    // wait until mysql service on the last node is up
    try {
        salt.commandStatus(env, lastNodeTarget, 'service mysql status', 'running')
    } catch (Exception er) {
        input message: "Database is not running please fix it first and only then click on PROCEED."
    }

    // start mysql services on the rest of the nodes
    salt.runSaltProcessStep(env, "I@galera:master and not ${lastNodeTarget}", 'service.start', ['mysql'])
    salt.runSaltProcessStep(env, "I@galera:slave and not ${lastNodeTarget}", 'service.start', ['mysql'])

    // wait until mysql service on the rest of the nodes is up
    try {
        salt.commandStatus(env, otherNodesTarget, 'service mysql status', 'running')
    } catch (Exception er) {
        input message: "Database is not running please fix it first and only then click on PROCEED."
    }

    // apply any changes in configuration and return value to gcom parameter
    return salt.enforceState(['saltId': env, 'target': lastNodeTarget, 'state': 'galera'])
}
Martin Polreiche48741b2019-03-21 16:00:23 +0100352
/**
 * Restores Galera database
 *
 * Removes the 'dbrestored' marker file from the xtrabackup backup directory and applies the
 * 'xtrabackup.client' state on the target node.
 *
 * @param env Salt Connection object or pepperEnv
 * @param targetNode Node to be targeted
 * @return output of the salt-call command
 */
def restoreGaleraDb(env, targetNode) {
    // 'salt' has to be instantiated here, it is not available in this scope otherwise
    def salt = new com.mirantis.mk.Salt()
    def backup_dir = salt.getReturnValues(salt.getPillar(env, targetNode, 'xtrabackup:client:backup_dir'))
    // fall back to the default location when the pillar does not define the backup directory
    if (backup_dir == null || backup_dir.isEmpty()) {
        backup_dir = '/var/backups/mysql/xtrabackup'
    }
    salt.runSaltProcessStep(env, targetNode, 'file.remove', ["${backup_dir}/dbrestored"])
    salt.cmdRun(env, targetNode, "su root -c 'salt-call state.sls xtrabackup.client'")
}
364
/**
 * Backward compatible entry point kept for pipelines which still call 'restoreGaleraDb' with the
 * env parameter only. It prints a warning and runs 'restoreGaleraCluster'.
 *
 * @param env Salt Connection object or pepperEnv
 * @return result of restoreGaleraCluster
 */
def restoreGaleraDb(env) {
    // 'common' has to be instantiated here, it is not available in this scope otherwise
    def common = new com.mirantis.mk.Common()
    common.warningMsg("This method was renamed to 'restoreGaleraCluster'. Please change your pipeline to use this call instead! If you think that you really wanted to call 'restoreGaleraDb' you may be missing 'targetNode' parameter in you call.")
    return restoreGaleraCluster(env)
}
369
/**
 * Start first node in mysql cluster. Cluster members stay removed in mysql config, additional service restart will be needed once all nodes are up.
 * https://docs.mirantis.com/mcp/q4-18/mcp-operations-guide/tshooting/
 * tshoot-mcp-openstack/tshoot-galera/restore-galera-cluster/
 * restore-galera-manually.html#restore-galera-manually
 *
 * @param env Salt Connection object or pepperEnv
 * @param target last stopped Galera node
 * @return result of the retry block which waits for the OPERATIONAL wsrep state
 */
def startFirstNode(env, target) {
    def salt = new com.mirantis.mk.Salt()
    def common = new com.mirantis.mk.Common()

    // make sure that gcom parameter is empty:
    // comment out the existing wsrep_cluster_address lines and append an empty gcomm:// address after them
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/ s/^#*/#/' /etc/mysql/my.cnf")
    salt.cmdRun(env, target, "sed -i '/wsrep_cluster_address/a wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")

    // start mysql service on the last node
    salt.runSaltProcessStep(env, target, 'service.start', ['mysql'])

    // wait until mysql service on the last node is up
    common.retry(30, 10) {
        // Declared with 'def' so the value stays local instead of leaking into the script binding
        def value = getWsrepParameters(env, target, 'wsrep_evs_state')
        if (value['wsrep_evs_state'] == 'OPERATIONAL') {
            common.infoMsg('WSREP state: OPERATIONAL')
        } else {
            // the exception makes the surrounding retry block try again
            throw new Exception("Mysql service is not running please fix it.")
        }
    }
}