Update Galera functions for verify/restore
* update the function for getting Galera status so that it can query any
  available node, not only the master;
* update the function for restoring the Galera cluster;
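
A rough usage sketch of the new contract (the 'pepperEnv' Salt connection
and the surrounding pipeline are assumed):

    def galera = new com.mirantis.mk.Galera()
    def report = galera.verifyGaleraStatus(pepperEnv, true)
    if (report['clusterMembersNotAlive']) {
        galera.restoreGaleraCluster(pepperEnv, report, false)
    }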
Related: PROD-32619
Change-Id: Ie70aee7ad9aa255a2cdcad1cbbc16c6953779516
(cherry picked from commit 527bf4b74356da652ef6e6a0a71e687fcf8ac688)
diff --git a/src/com/mirantis/mk/Galera.groovy b/src/com/mirantis/mk/Galera.groovy
index 3a10a1c..e5ffe12 100644
--- a/src/com/mirantis/mk/Galera.groovy
+++ b/src/com/mirantis/mk/Galera.groovy
@@ -50,50 +50,44 @@
 * of Salt mysql.status function. The result is then parsed, validated and output to the user.
*
* @param env Salt Connection object or pepperEnv
- * @param slave Boolean value to enable slave checking (if master in unreachable)
* @param checkTimeSync Boolean value to enable time sync check
- * @return resultCode int values used to determine exit status in the calling function
+ * @return map Status report of the cluster members; its 'error' key holds the exit code for the calling function
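+ *
+ * Example (a sketch, assuming 'pepperEnv' is the Salt connection):
+ *   def report = verifyGaleraStatus(pepperEnv, true)
+ *   if (report['error'] != 0) {
+ *       println "Galera verification failed with code ${report['error']}"
+ *   }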
*/
-def verifyGaleraStatus(env, slave=false, checkTimeSync=false) {
+def verifyGaleraStatus(env, checkTimeSync=false) {
def salt = new com.mirantis.mk.Salt()
def common = new com.mirantis.mk.Common()
- def out = ""
- def status = "unknown"
- def testNode = ""
- if (!slave) {
- try {
- galeraMaster = salt.getMinions(env, "I@galera:master")
- common.infoMsg("Current Galera master is: ${galeraMaster}")
- salt.minionsReachable(env, "I@salt:master", "I@galera:master")
- testNode = "I@galera:master"
- } catch (Exception e) {
- common.errorMsg('Galera master is not reachable.')
- common.errorMsg(e.getMessage())
- return 128
- }
- } else {
- try {
- galeraSlaves = salt.getMinions(env, "I@galera:slave")
- common.infoMsg("Testing Galera slave minions: ${galeraSlaves}")
- } catch (Exception e) {
- common.errorMsg("Cannot obtain Galera slave minions list.")
- common.errorMsg(e.getMessage())
- return 129
- }
- for (minion in galeraSlaves) {
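+    // report map describing the cluster state:
+    //   clusterMembersOnPower        - minions that responded to a ping
+    //   clusterMembersNotAvailable   - minions that could not be reached
+    //   clusterMembersInClusterAlive - members reporting a wsrep cluster status
+    //   clusterMembersNotAlive       - members with no wsrep status (mysql down or not joined)
+    //   error                        - resulting exit code (0 means no error)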
+ def mysqlStatusReport = [
+ 'clusterMembersOnPower': [],
+ 'clusterMembersNotAvailable': [],
+ 'clusterMembersInClusterAlive': [],
+ 'clusterMembersNotAlive': [],
+ 'error': 0
+ ]
+
+ try {
+ def clusterMembers = salt.getMinions(env, "I@galera:master or I@galera:slave")
+ for (minion in clusterMembers) {
try {
salt.minionsReachable(env, "I@salt:master", minion)
- testNode = minion
- break
+ mysqlStatusReport['clusterMembersOnPower'] << minion
} catch (Exception e) {
common.warningMsg("Slave '${minion}' is not reachable.")
+ mysqlStatusReport['clusterMembersNotAvailable'] << minion
}
}
+ } catch (Exception e) {
+ common.errorMsg('Cannot obtain Galera minions list.')
+ common.errorMsg(e.getMessage())
+ mysqlStatusReport['error'] = 128
+ return mysqlStatusReport
}
- if (!testNode) {
- common.errorMsg("No Galera slave was reachable.")
- return 130
+
+ if (!mysqlStatusReport['clusterMembersOnPower']) {
+ common.errorMsg("No Galera member was reachable.")
+ mysqlStatusReport['error'] = 130
+ return mysqlStatusReport
}
+
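+    // before checking the cluster itself, make sure the backup nodes are reachable
+    // and their disks are not saturated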
def checkTargets = salt.getMinions(env, "I@xtrabackup:client or I@xtrabackup:server")
for (checkTarget in checkTargets) {
def nodeStatus = salt.minionsReachable(env, 'I@salt:master', checkTarget, null, 10, 5)
@@ -101,13 +95,15 @@
def iostatRes = salt.getIostatValues(['saltId': env, 'target': checkTarget, 'parameterName': "%util", 'output': true])
if (iostatRes == [:]) {
common.errorMsg("Recevived empty response from iostat call on ${checkTarget}. Maybe 'sysstat' package is not installed?")
- return 140
+ mysqlStatusReport['error'] = 140
+ return mysqlStatusReport
}
for (int i = 0; i < iostatRes.size(); i++) {
def diskKey = iostatRes.keySet()[i]
if (!(iostatRes[diskKey].toString().isBigDecimal() && (iostatRes[diskKey].toBigDecimal() < 50 ))) {
common.errorMsg("Disk ${diskKey} has to high i/o utilization. Maximum value is 50 and current value is ${iostatRes[diskKey]}.")
- return 141
+ mysqlStatusReport['error'] = 141
+ return mysqlStatusReport
}
}
}
@@ -115,36 +111,65 @@
common.infoMsg("Disk i/o utilization was checked and everything seems to be in order.")
if (checkTimeSync && !salt.checkClusterTimeSync(env, "I@galera:master or I@galera:slave")) {
common.errorMsg("Time in cluster is desynchronized or it couldn't be detemined. You should fix this issue manually before proceeding.")
- return 131
+ mysqlStatusReport['error'] = 131
+ return mysqlStatusReport
}
+
+ for(member in mysqlStatusReport['clusterMembersOnPower']) {
+ def clusterStatus = getWsrepParameters(env, member, 'wsrep_cluster_status')
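+        // an empty wsrep_cluster_status means the node did not answer the wsrep
+        // query, i.e. mysql is down or the member has not joined the cluster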
+ if (clusterStatus['wsrep_cluster_status']) {
+ mysqlStatusReport['clusterMembersInClusterAlive'] << member
+ } else {
+ mysqlStatusReport['clusterMembersNotAlive'] << member
+ }
+ }
+ if (!mysqlStatusReport['clusterMembersInClusterAlive']) {
+ common.errorMsg("Could not determine mysql status, because all nodes are not connected to cluster.")
+ mysqlStatusReport['error'] = 256
+ return mysqlStatusReport
+ }
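+    // pick the alphabetically first alive member so repeated runs query the same node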
+ def testNode = mysqlStatusReport['clusterMembersInClusterAlive'].sort().first()
+
try {
- out = salt.runSaltProcessStep(env, "${testNode}", "mysql.status", [], null, false)
+ mysqlStatusReport['statusRaw'] = salt.runSaltProcessStep(env, testNode, "mysql.status", [], null, false)
} catch (Exception e) {
common.errorMsg('Could not determine mysql status.')
common.errorMsg(e.getMessage())
- return 256
+ mysqlStatusReport['error'] = 256
+ return mysqlStatusReport
}
- if (out) {
+
+ def status = "unknown"
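+    // check the pillar to learn whether the chosen test node is the configured Galera master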
+ def galeraMasterNode = salt.getReturnValues(salt.getPillar(env, testNode, "galera:master:enabled")) ? true : false
+
+ if (mysqlStatusReport['statusRaw']) {
try {
- status = validateAndPrintGaleraStatusReport(env, out, testNode)
+ status = validateAndPrintGaleraStatusReport(env, mysqlStatusReport['statusRaw'], testNode, galeraMasterNode)
} catch (Exception e) {
common.errorMsg('Could not parse the mysql status output. Check it manually.')
common.errorMsg(e.getMessage())
- return 1
}
} else {
- common.errorMsg("Mysql status response unrecognized or is empty. Response: ${out}")
- return 1024
+ common.errorMsg("Mysql status response unrecognized or is empty. Response: ${mysqlStatusReport['statusRaw']}")
}
+ if (mysqlStatusReport['clusterMembersNotAvailable']) {
+ common.errorMsg("Next nodes are unavailable: ${mysqlStatusReport['clusterMembersNotAvailable'].join(',')}")
+ }
+ if (mysqlStatusReport['clusterMembersNotAlive']) {
+ common.errorMsg("Next nodes are not connected to cluster: ${mysqlStatusReport['clusterMembersNotAlive'].join(',')}")
+ }
+
if (status == "OK") {
common.infoMsg("No errors found - MySQL status is ${status}.")
- return 0
+ return mysqlStatusReport
} else if (status == "unknown") {
        common.warningMsg('MySQL status cannot be determined')
- return 1
+ mysqlStatusReport['error'] = 1
+ return mysqlStatusReport
} else {
common.errorMsg("Errors found.")
- return 2
+ mysqlStatusReport['error'] = 2
+ return mysqlStatusReport
}
}
@@ -154,13 +179,12 @@
@return status "OK", "ERROR" or "uknown" depending on result of validation
*/
-def validateAndPrintGaleraStatusReport(env, out, minion) {
+def validateAndPrintGaleraStatusReport(env, out, minion, nodeRoleMaster=false) {
def salt = new com.mirantis.mk.Salt()
def common = new com.mirantis.mk.Common()
-    if (minion == "I@galera:master") {
-        role = "master"
-    } else {
-        role = "slave"
-    }
+    def role = nodeRoleMaster ? 'master' : 'slave'
sizeOut = salt.getReturnValues(salt.getPillar(env, minion, "galera:${role}:members"))
expected_cluster_size = sizeOut.size()
@@ -308,10 +332,10 @@
*/
def manageServiceMysql(env, targetNode, action, checkStatus=true, checkState='running') {
def salt = new com.mirantis.mk.Salt()
- salt.runSaltProcessStep(env, lastNodeTarget, "service.${action}", ['mysql'])
+ salt.runSaltProcessStep(env, targetNode, "service.${action}", ['mysql'])
if (checkStatus) {
try {
- salt.commandStatus(env, lastNodeTarget, 'service mysql status', checkState)
+ salt.commandStatus(env, targetNode, 'service mysql status', checkState)
} catch (Exception er) {
input message: "Database is not running please fix it first and only then click on PROCEED."
}
@@ -321,34 +345,62 @@
/**
* Restores Galera cluster
* @param env Salt Connection object or pepperEnv
- * @param runRestoreDb Boolean to determine if the restoration of DB should be run as well
+ * @param galeraStatus Map, Status of Galera cluster output from verifyGaleraStatus func
+ * @param restoreDb Boolean to determine whether the DB restore procedure should be run as well
* @return output of salt commands
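+ *
+ * Example (a sketch; rejoin dead members without restoring the database):
+ *   def report = verifyGaleraStatus(env)
+ *   if (report['clusterMembersNotAlive']) {
+ *       restoreGaleraCluster(env, report, false)
+ *   }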
*/
-def restoreGaleraCluster(env, runRestoreDb=true) {
+def restoreGaleraCluster(env, galeraStatus, restoreDb=true) {
def salt = new com.mirantis.mk.Salt()
def common = new com.mirantis.mk.Common()
- lastNodeTarget = getGaleraLastShutdownNode(env)
- manageServiceMysql(env, lastNodeTarget, 'stop', false)
- if (runRestoreDb) {
- salt.cmdRun(env, lastNodeTarget, "mkdir -p /root/mysql/mysql.bak")
- salt.cmdRun(env, lastNodeTarget, "rm -rf /root/mysql/mysql.bak/*")
- salt.cmdRun(env, lastNodeTarget, "mv /var/lib/mysql/* /root/mysql/mysql.bak")
+ def nodesToRecover = []
+    def total = false // whether to restore the whole cluster
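+    // if some members dropped out of the cluster, recover just those (the whole
+    // cluster only when no member is left alive); otherwise do a full restore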
+ if (galeraStatus['clusterMembersNotAlive']) {
+ nodesToRecover = galeraStatus['clusterMembersNotAlive']
+ if (galeraStatus['clusterMembersInClusterAlive'].size() == 0) {
+ total = true
+ }
+ } else {
+ nodesToRecover = galeraStatus['clusterMembersInClusterAlive']
+ total = true
}
- salt.cmdRun(env, lastNodeTarget, "rm -f /var/lib/mysql/.galera_bootstrap")
- // make sure that gcom parameter is empty
- salt.cmdRun(env, lastNodeTarget, "sed -i '/gcomm/c\\wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")
+ def lastNodeTarget = ''
+ if (total) {
+ manageServiceMysql(env, 'I@galera:slave', 'stop', true, 'inactive')
+ manageServiceMysql(env, 'I@galera:master', 'stop', true, 'inactive')
+        lastNodeTarget = getGaleraLastShutdownNode(env) // in case the master was already down before
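+        // wipe the InnoDB logs and the Galera state file on every member except the
+        // bootstrap node, so those members rejoin via a full state transfer (SST)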
+ salt.cmdRun(env, "( I@galera:master or I@galera:slave ) and not ${lastNodeTarget}", "rm -f /var/lib/mysql/ib_logfile*")
+ salt.cmdRun(env, "( I@galera:master or I@galera:slave ) and not ${lastNodeTarget}", "rm -f /var/lib/mysql/grastate.dat")
+ } else {
+ lastNodeTarget = nodesToRecover.join(' or ')
+ manageServiceMysql(env, lastNodeTarget, 'stop', true, 'inactive')
+ }
- // run restore of DB
- if (runRestoreDb) {
+ if (restoreDb) {
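+        // move the current datadir content aside into a unique timestamped backup directory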
+ def timestamp = common.getDatetime()
+ salt.cmdRun(env, lastNodeTarget, "mkdir -p /root/mysql")
+ def bakDir = salt.getReturnValues(salt.cmdRun(env, lastNodeTarget, "mktemp -d --suffix='_${timestamp}' /root/mysql/mysql.bak.XXXXXX", false))
+ salt.cmdRun(env, lastNodeTarget, "mv /var/lib/mysql/* ${bakDir} || echo 'Nothing to backup from directory /var/lib/mysql/'")
+ }
+ if (total) {
+        // make sure that the gcomm parameter is empty
+ salt.cmdRun(env, lastNodeTarget, "sed -i '/gcomm/c\\wsrep_cluster_address=\"gcomm://\"' /etc/mysql/my.cnf")
+ } else if (!restoreDb) {
+        // node rejoin: wipe the InnoDB logs and the Galera state file so the node re-syncs from the cluster
+ salt.cmdRun(env, lastNodeTarget, "rm -f /var/lib/mysql/ib_logfile*")
+ salt.cmdRun(env, lastNodeTarget, "rm -f /var/lib/mysql/grastate.dat")
+ }
+
+ if (restoreDb) {
restoreGaleraDb(env, lastNodeTarget)
}
manageServiceMysql(env, lastNodeTarget, 'start')
- // apply any changes in configuration and return value to gcom parameter and then restart mysql to catch
- salt.enforceState(['saltId': env, 'target': lastNodeTarget, 'state': 'galera'])
- manageServiceMysql(env, lastNodeTarget, 'restart')
+ if (total) {
+ manageServiceMysql(env, "( I@galera:master or I@galera:slave ) and not ${lastNodeTarget}", 'start')
+ salt.runSaltProcessStep(env, lastNodeTarget, 'state.sls_id', ['galera_config', 'galera'])
+ }
}
/**