Cassandra repair pipeline fixes:
- stop neutron-server for OC 4.X config db repair case;
- stop control and analytics containers during the repair procedure to prevent
requests to configdb;
- add verification of cassandra db readiness before the backup restore procedure;
- increase the number of retries for the final health check.
Change-Id: I0e88d7a013e5452fccbf63c4141afb31aac2b9bd
Related-PROD: PROD-31354
diff --git a/restore-cassandra.groovy b/restore-cassandra.groovy
index bdcf2cd..fb1259f 100644
--- a/restore-cassandra.groovy
+++ b/restore-cassandra.groovy
@@ -38,53 +38,70 @@
}
stage('Restore') {
+ // stop neutron-server to prevent CRUD api calls to contrail-api service
+ try {
+ salt.runSaltProcessStep(pepperEnv, 'I@neutron:server', 'service.stop', ['neutron-server'], null, true)
+ } catch (Exception er) {
+ common.warningMsg('neutron-server service already stopped')
+ }
// get opencontrail version
def contrailVersion = getValueForPillarKey(pepperEnv, "I@opencontrail:control:role:primary", "_param:opencontrail_version")
+ def configDbIp = getValueForPillarKey(pepperEnv, "I@opencontrail:control:role:primary", "opencontrail:database:bind:host")
+ def configDbPort = getValueForPillarKey(pepperEnv, "I@opencontrail:control:role:primary", "opencontrail:database:bind:port_configdb")
common.infoMsg("OpenContrail version is ${contrailVersion}")
if (contrailVersion.startsWith('4')) {
controllerImage = getValueForPillarKey(pepperEnv, "I@opencontrail:control:role:primary",
"docker:client:compose:opencontrail:service:controller:container_name")
common.infoMsg("Applying db restore procedure for OpenContrail 4.X version")
try {
- salt.cmdRun(pepperEnv, 'I@opencontrail:control', 'doctrail controller systemctl stop contrail-database' )
+ common.infoMsg("Stop contrail control plane containers")
+ salt.cmdRun(pepperEnv, 'I@opencontrail:control or I@opencontrail:collector', 'cd /etc/docker/compose/opencontrail/; docker-compose down')
} catch (Exception err) {
- common.errorMsg('An error has been occurred during cassandra db shutdown: ' + err.getMessage())
+ common.errorMsg('An error has been occurred during contrail containers shutdown: ' + err.getMessage())
throw err
}
try {
- salt.cmdRun(pepperEnv, 'I@opencontrail:control', "docker exec ${controllerImage} bash -c 'for f in \$(ls /var/lib/cassandra/); do rm -r /var/lib/cassandra/\$f; done'")
+ common.infoMsg("Cleanup cassandra data")
+ salt.cmdRun(pepperEnv, 'I@opencontrail:control', 'for f in $(ls /var/lib/configdb/); do rm -r /var/lib/configdb/$f; done')
} catch (Exception err) {
- common.errorMsg('Cannot cleanup cassandra data: ' + err.getMessage())
+ common.errorMsg('Cannot cleanup cassandra data on control nodes: ' + err.getMessage())
throw err
}
try {
- salt.cmdRun(pepperEnv, 'I@cassandra:backup:client', 'doctrail controller systemctl start contrail-database' )
+ common.infoMsg("Start cassandra db on I@cassandra:backup:client node")
+ salt.cmdRun(pepperEnv, 'I@cassandra:backup:client', 'cd /etc/docker/compose/opencontrail/; docker-compose up -d')
} catch (Exception err) {
- common.errorMsg('An error has been occurred during cassandra db startup: ' + err.getMessage())
+ common.errorMsg('An error has been occurred during cassandra db startup on I@cassandra:backup:client node: ' + err.getMessage())
throw err
}
+ // wait for cassandra to be online
+ common.retry(6, 20){
+ common.infoMsg("Trying to connect to casandra db on I@cassandra:backup:client node ...")
+ salt.cmdRun(pepperEnv, 'I@cassandra:backup:client', "nc -v -z -w2 ${configDbIp} ${configDbPort}")
+ }
// remove restore-already-happened file if any is present
try {
salt.cmdRun(pepperEnv, 'I@cassandra:backup:client', 'rm /var/backups/cassandra/dbrestored')
} catch (Exception err) {
common.warningMsg('/var/backups/cassandra/dbrestored not present? ' + err.getMessage())
}
- // perform restore steps
salt.enforceState(pepperEnv, 'I@cassandra:backup:client', "cassandra")
- salt.runSaltProcessStep(pepperEnv, 'I@cassandra:backup:client', 'system.reboot', null, [], true, 5)
- sleep(5)
- salt.runSaltProcessStep(pepperEnv, 'I@opencontrail:control and not I@cassandra:backup:client', 'system.reboot', null, [], true, 5)
- // the lovely wait-60-seconds mantra before restarting supervisor-database service
- sleep(60)
- salt.cmdRun(pepperEnv, 'I@opencontrail:control', "doctrail controller systemctl restart contrail-database")
+ try {
+ salt.cmdRun(pepperEnv, 'I@opencontrail:control and not I@cassandra:backup:client', 'cd /etc/docker/compose/opencontrail/; docker-compose up -d')
+ } catch (Exception err) {
+ common.errorMsg('An error has been occurred during cassandra db startup on I@opencontrail:control and not I@cassandra:backup:client nodes: ' + err.getMessage())
+ throw err
+ }
// another mantra, wait till all services are up
sleep(60)
- } else {
try {
- salt.runSaltProcessStep(pepperEnv, 'I@neutron:server', 'service.stop', ['neutron-server'], null, true)
- } catch (Exception er) {
- common.warningMsg('neutron-server service already stopped')
+ common.infoMsg("Start analytics containers node")
+ salt.cmdRun(pepperEnv, 'I@opencontrail:collector', 'cd /etc/docker/compose/opencontrail/; docker-compose up -d')
+ } catch (Exception err) {
+ common.errorMsg('An error has been occurred during analytics containers startup: ' + err.getMessage())
+ throw err
}
+ } else {
try {
salt.runSaltProcessStep(pepperEnv, 'I@opencontrail:control', 'service.stop', ['supervisor-config'], null, true)
} catch (Exception er) {
@@ -135,7 +152,6 @@
sleep(5)
salt.runSaltProcessStep(pepperEnv, 'I@opencontrail:control', 'service.restart', ['supervisor-database'], null, true)
- salt.runSaltProcessStep(pepperEnv, 'I@neutron:server', 'service.start', ['neutron-server'], null, true)
// wait until contrail-status is up
salt.commandStatus(pepperEnv, 'I@opencontrail:control', "contrail-status | grep -v == | grep -v \'disabled on boot\' | grep -v nodemgr | grep -v active | grep -v backup", null, false)
@@ -143,10 +159,12 @@
salt.cmdRun(pepperEnv, 'I@opencontrail:control', "nodetool status")
salt.cmdRun(pepperEnv, 'I@opencontrail:control', "contrail-status")
}
+
+ salt.runSaltProcessStep(pepperEnv, 'I@neutron:server', 'service.start', ['neutron-server'], null, true)
}
stage('Opencontrail controllers health check') {
- common.retry(3, 20){
+ common.retry(9, 20){
salt.enforceState(pepperEnv, 'I@opencontrail:control or I@opencontrail:collector', 'opencontrail.upgrade.verify', true, true)
}
}