# blob: 44658ea736c74528ae461070e277746d8b6ac893 [file] [log] [blame]
# Copyright 2017 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import pytest
from devops.helpers import helpers
from tcp_tests import logger
LOG = logger.logger
class TestFailoverCeph(object):
"""Test class for testing MCP Ceph failover"""
# Name of the Jenkins job started for the Tempest run, and the parameters
# passed to it: the 'smoke' test set against the 'internalURL' endpoint type.
TEMPEST_JOB_NAME = 'cvp-tempest'
TEMPEST_JOB_PARAMETERS = {
'TEMPEST_ENDPOINT_TYPE': 'internalURL',
'TEMPEST_TEST_PATTERN': 'set=smoke'
}
# Name of the Jenkins job started for the sanity run, and its parameters.
# The single 'envs' entry is one string (two adjacent literals are
# concatenated): a 'tests_set=-k ...' expression that excludes
# test_ceph_health and test_prometheus_alert_count.
SANITY_JOB_NAME = 'cvp-sanity'
SANITY_JOB_PARAMETERS = {
'EXTRA_PARAMS': {
'envs': [
"tests_set=-k "
"'not test_ceph_health and not test_prometheus_alert_count'"
]
}
}
# Passed as start_timeout / build_timeout to start_job_on_jenkins().
# NOTE(review): presumably seconds (1 minute / 15 minutes) - confirm.
JENKINS_START_TIMEOUT = 60
JENKINS_BUILD_TIMEOUT = 60 * 15
def get_ceph_health(self, ssh, node_names):
    """Collect the output of 'ceph -s' from every given node

    :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
    :param node_names: list, full hostnames of the nodes to query
    :return: dict, hostname -> stdout of the 'ceph -s' command executed
        on that host
    """
    health_by_node = {}
    for hostname in node_names:
        # NOTE(review): raise_on_err=False presumably keeps a failing
        # 'ceph -s' from raising, so its output is still recorded - confirm.
        result = ssh.check_call(
            "ceph -s", node_name=hostname, raise_on_err=False)
        health_by_node[hostname] = result['stdout_str']
    return health_by_node
@pytest.mark.grab_versions
@pytest.mark.restart_osd_node
def test_restart_osd_node(
        self,
        salt_actions,
        drivetrain_actions,
        underlay_actions,
        show_step):
    """Verify that Ceph OSD node is not affected by system restart

    Scenario:
        1. Find Ceph OSD nodes
        2. Check Ceph cluster health before node restart (skipped until
           PROD-31374 is fixed)
        3. Restart 1 Ceph OSD node
        4. Check Ceph cluster health after node restart (skipped until
           PROD-31374 is fixed)
        5. Run Tempest smoke test suite
        6. Run test_ceph_status.py::test_ceph_osd and
           test_services.py::test_check_services[osd] sanity tests

    Duration: ~9 min
    """
    # NOTE(review): show_step(N) presumably looks the step text up by its
    # number in the Scenario above - keep the numbering in sync; confirm.
    #
    # Fixtures, as used below:
    #   salt_actions       - runs Salt functions on targeted minions
    #   drivetrain_actions - starts Jenkins jobs and returns their status
    #   underlay_actions   - SSH access to the environment nodes
    #   show_step          - called with the number of the current step
    salt = salt_actions
    ssh = underlay_actions
    dt = drivetrain_actions

    # Find Ceph OSD nodes
    show_step(1)
    tgt = "I@ceph:osd"
    # Materialize the minion names into a sorted list: a dict view cannot
    # be indexed on Python 3, and sorting makes the node choice repeatable.
    osd_hosts = sorted(salt.local(tgt, "test.ping")['return'][0])
    assert osd_hosts, (
        "No Ceph OSD nodes found by Salt target '{0}'".format(tgt))
    # Select a node for the test
    osd_host = osd_hosts[0]

    # Check Ceph cluster health before node restart
    show_step(2)
    ceph_health = self.get_ceph_health(ssh, osd_hosts)  # noqa
    # FIXME: uncomment the check once PROD-31374 is fixed
    # status = all(
    #     ["OK" in status for node, status in ceph_health.items()])
    # assert status, "Ceph health is not OK: {0}".format(ceph_health)

    # Restart a Ceph OSD node
    show_step(3)
    LOG.info("Sending reboot command to '{}' node.".format(osd_host))
    remote = ssh.remote(node_name=osd_host)
    remote.execute_async("/sbin/shutdown -r now")

    # Wait for restarted node to boot and become accessible
    helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
    echo_request = "echo"
    echo_response = salt.local(
        osd_host, "test.echo", echo_request)['return'][0]
    # .get(): a minion that did not answer may be missing from the reply;
    # fail with the message below instead of a bare KeyError.
    assert echo_request == echo_response.get(osd_host), (
        "Minion on node '{}' node is not responding after node "
        "reboot.".format(osd_host)
    )
    LOG.info("'{}' node is back after reboot.".format(osd_host))

    # Check Ceph cluster health after node restart
    show_step(4)
    ceph_health = self.get_ceph_health(ssh, osd_hosts)  # noqa
    # FIXME: uncomment the check once PROD-31374 is fixed
    # status = all(
    #     ["OK" in status for node, status in ceph_health.items()])
    # assert status, "Ceph health is not OK: {0}".format(ceph_health)

    # Run Tempest smoke test suite
    show_step(5)
    status = dt.start_job_on_jenkins(
        job_name=self.TEMPEST_JOB_NAME,
        job_parameters=self.TEMPEST_JOB_PARAMETERS,
        start_timeout=self.JENKINS_START_TIMEOUT,
        build_timeout=self.JENKINS_BUILD_TIMEOUT
    )
    assert status == 'SUCCESS', (
        "'{0}' job run status is {1} after executing Tempest smoke "
        "tests".format(
            self.TEMPEST_JOB_NAME, status)
    )

    # Run Sanity test
    show_step(6)
    status = dt.start_job_on_jenkins(
        job_name=self.SANITY_JOB_NAME,
        job_parameters=self.SANITY_JOB_PARAMETERS,
        start_timeout=self.JENKINS_START_TIMEOUT,
        build_timeout=self.JENKINS_BUILD_TIMEOUT
    )
    assert status == 'SUCCESS', (
        "'{0}' job run status is {1} after executing selected sanity "
        "tests".format(
            self.SANITY_JOB_NAME, status)
    )
@pytest.mark.grab_versions
@pytest.mark.restart_cmn_node
def test_restart_cmn_node(
        self,
        salt_actions,
        drivetrain_actions,
        underlay_actions,
        show_step):
    """Verify that Ceph CMN node is not affected by system restart

    Scenario:
        1. Find Ceph CMN nodes
        2. Check Ceph cluster health before node restart (skipped until
           PROD-31374 is fixed)
        3. Restart 1 Ceph CMN node
        4. Check Ceph cluster health after node restart (skipped until
           PROD-31374 is fixed)
        5. Run Tempest smoke test suite
        6. Run test_ceph_status.py::test_ceph_replicas and
           test_services.py::test_check_services[cmn] sanity tests

    Duration: ~9 min
    """
    # NOTE(review): show_step(N) presumably looks the step text up by its
    # number in the Scenario above - keep the numbering in sync; confirm.
    #
    # Fixtures, as used below:
    #   salt_actions       - runs Salt functions on targeted minions
    #   drivetrain_actions - starts Jenkins jobs and returns their status
    #   underlay_actions   - SSH access to the environment nodes
    #   show_step          - called with the number of the current step
    salt = salt_actions
    ssh = underlay_actions
    dt = drivetrain_actions

    # Find Ceph CMN nodes
    show_step(1)
    tgt = "I@ceph:mon"
    # Materialize the minion names into a sorted list: a dict view cannot
    # be indexed on Python 3, and sorting makes the node choice repeatable.
    cmn_hosts = sorted(salt.local(tgt, "test.ping")['return'][0])
    assert cmn_hosts, (
        "No Ceph CMN nodes found by Salt target '{0}'".format(tgt))
    # Select a node for the test
    cmn_host = cmn_hosts[0]

    # Check Ceph cluster health before node restart
    show_step(2)
    ceph_health = self.get_ceph_health(ssh, cmn_hosts)  # noqa
    # FIXME: uncomment the check once PROD-31374 is fixed
    # status = all(
    #     ["OK" in status for node, status in ceph_health.items()])
    # assert status, "Ceph health is not OK: {0}".format(ceph_health)

    # Restart a Ceph CMN node
    show_step(3)
    LOG.info("Sending reboot command to '{}' node.".format(cmn_host))
    remote = ssh.remote(node_name=cmn_host)
    remote.execute_async("/sbin/shutdown -r now")

    # Wait for restarted node to boot and become accessible
    helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
    echo_request = "echo"
    echo_response = salt.local(
        cmn_host, "test.echo", echo_request)['return'][0]
    # .get(): a minion that did not answer may be missing from the reply;
    # fail with the message below instead of a bare KeyError.
    assert echo_request == echo_response.get(cmn_host), (
        "Minion on node '{}' node is not responding after node "
        "reboot.".format(cmn_host)
    )
    LOG.info("'{}' node is back after reboot.".format(cmn_host))

    # Check Ceph cluster health after node restart
    show_step(4)
    ceph_health = self.get_ceph_health(ssh, cmn_hosts)  # noqa
    # FIXME: uncomment the check once PROD-31374 is fixed
    # status = all(
    #     ["OK" in status for node, status in ceph_health.items()])
    # assert status, "Ceph health is not OK: {0}".format(ceph_health)

    # Run Tempest smoke test suite
    show_step(5)
    status = dt.start_job_on_jenkins(
        job_name=self.TEMPEST_JOB_NAME,
        job_parameters=self.TEMPEST_JOB_PARAMETERS,
        start_timeout=self.JENKINS_START_TIMEOUT,
        build_timeout=self.JENKINS_BUILD_TIMEOUT
    )
    assert status == 'SUCCESS', (
        "'{0}' job run status is {1} after executing Tempest smoke "
        "tests".format(
            self.TEMPEST_JOB_NAME, status)
    )

    # Run Sanity test
    show_step(6)
    status = dt.start_job_on_jenkins(
        job_name=self.SANITY_JOB_NAME,
        job_parameters=self.SANITY_JOB_PARAMETERS,
        start_timeout=self.JENKINS_START_TIMEOUT,
        build_timeout=self.JENKINS_BUILD_TIMEOUT
    )
    assert status == 'SUCCESS', (
        "'{0}' job run status is {1} after executing selected sanity "
        "tests".format(
            self.SANITY_JOB_NAME, status)
    )
@pytest.mark.grab_versions
@pytest.mark.restart_rgw_node
def test_restart_rgw_node(
        self,
        salt_actions,
        drivetrain_actions,
        underlay_actions,
        show_step):
    """Verify that Ceph RGW node is not affected by system restart

    Scenario:
        1. Find Ceph RGW nodes
        2. Check Ceph cluster health before node restart (skipped until
           PROD-31374 is fixed)
        3. Restart 1 Ceph RGW node
        4. Check Ceph cluster health after node restart (skipped until
           PROD-31374 is fixed)
        5. Run Tempest smoke test suite
        6. Run test_services.py::test_check_services[rgw] sanity test

    Duration: ~9 min
    """
    # NOTE(review): show_step(N) presumably looks the step text up by its
    # number in the Scenario above - keep the numbering in sync; confirm.
    #
    # Fixtures, as used below:
    #   salt_actions       - runs Salt functions on targeted minions
    #   drivetrain_actions - starts Jenkins jobs and returns their status
    #   underlay_actions   - SSH access to the environment nodes
    #   show_step          - called with the number of the current step
    salt = salt_actions
    ssh = underlay_actions
    dt = drivetrain_actions

    # Find Ceph RGW nodes
    show_step(1)
    tgt = "I@ceph:radosgw"
    # Materialize the minion names into a sorted list: a dict view cannot
    # be indexed on Python 3, and sorting makes the node choice repeatable.
    rgw_hosts = sorted(salt.local(tgt, "test.ping")['return'][0])
    assert rgw_hosts, (
        "No Ceph RGW nodes found by Salt target '{0}'".format(tgt))
    # Select a node for the test
    rgw_host = rgw_hosts[0]

    # Check Ceph cluster health before node restart
    show_step(2)
    ceph_health = self.get_ceph_health(ssh, rgw_hosts)  # noqa
    # FIXME: uncomment the check once PROD-31374 is fixed
    # status = all(
    #     ["OK" in status for node, status in ceph_health.items()])
    # assert status, "Ceph health is not OK: {0}".format(ceph_health)

    # Restart a Ceph RGW node
    show_step(3)
    LOG.info("Sending reboot command to '{}' node.".format(rgw_host))
    remote = ssh.remote(node_name=rgw_host)
    remote.execute_async("/sbin/shutdown -r now")

    # Wait for restarted node to boot and become accessible
    helpers.wait_pass(remote.reconnect, timeout=60 * 3, interval=5)
    echo_request = "echo"
    echo_response = salt.local(
        rgw_host, "test.echo", echo_request)['return'][0]
    # .get(): a minion that did not answer may be missing from the reply;
    # fail with the message below instead of a bare KeyError.
    assert echo_request == echo_response.get(rgw_host), (
        "Minion on node '{}' node is not responding after node "
        "reboot.".format(rgw_host)
    )
    LOG.info("'{}' node is back after reboot.".format(rgw_host))

    # Check Ceph cluster health after node restart
    show_step(4)
    ceph_health = self.get_ceph_health(ssh, rgw_hosts)  # noqa
    # FIXME: uncomment the check once PROD-31374 is fixed
    # status = all(
    #     ["OK" in status for node, status in ceph_health.items()])
    # assert status, "Ceph health is not OK: {0}".format(ceph_health)

    # Run Tempest smoke test suite
    show_step(5)
    status = dt.start_job_on_jenkins(
        job_name=self.TEMPEST_JOB_NAME,
        job_parameters=self.TEMPEST_JOB_PARAMETERS,
        start_timeout=self.JENKINS_START_TIMEOUT,
        build_timeout=self.JENKINS_BUILD_TIMEOUT
    )
    assert status == 'SUCCESS', (
        "'{0}' job run status is {1} after executing Tempest smoke "
        "tests".format(
            self.TEMPEST_JOB_NAME, status)
    )

    # Run Sanity test
    show_step(6)
    status = dt.start_job_on_jenkins(
        job_name=self.SANITY_JOB_NAME,
        job_parameters=self.SANITY_JOB_PARAMETERS,
        start_timeout=self.JENKINS_START_TIMEOUT,
        build_timeout=self.JENKINS_BUILD_TIMEOUT
    )
    assert status == 'SUCCESS', (
        "'{0}' job run status is {1} after executing selected sanity "
        "tests".format(
            self.SANITY_JOB_NAME, status)
    )
# #######################################################################
# ############# Tests for fuel-devops deployed environments #############
# #######################################################################
def show_failed_msg(self, failed):
    """Render failed Tempest tests as a single readable message

    :param failed: dict, test name -> failure details (both strings)
    :return: str, a header followed by one "name: details" entry per
        failed test, entries separated by an empty line
    """
    entries = []
    for test_name, details in failed.items():
        entries.append(test_name + ': ' + details)
    report = '\n\n '.join(entries)
    return "There are failed tempest tests:\n\n {0}".format(report)
@pytest.mark.grab_versions
@pytest.mark.fail_snapshot
def _test_restart_osd_node(self, func_name, underlay, config,
                           openstack_deployed, ceph_deployed,
                           openstack_actions, hardware,
                           rally, show_step):
    """Test restart ceph osd node

    Scenario:
        1. Find ceph osd nodes
        2. Check ceph health before restart
        3. Restart 1 ceph osd node
        4. Check ceph health after restart
        5. Run tempest smoke after failover
        6. Check tempest report for failed tests

    Requirements:
        - Salt cluster
        - OpenStack cluster
        - Ceph cluster
    """
    # Same command is run on all minions before and after the restart.
    # NOTE(review): presumably forces a clock resync so that time drift
    # does not affect the Ceph health output - confirm.
    ntp_resync_cmd = 'service ntp stop; ntpd -gq; service ntp start'
    openstack_actions._salt.local(
        tgt='*', fun='cmd.run', args=ntp_resync_cmd)

    # STEP #1
    show_step(1)
    osd_node_names = underlay.get_target_node_names(
        target='osd')

    # STEP #2
    show_step(2)
    # Get the ceph health output before restart
    health_before = self.get_ceph_health(underlay, osd_node_names)
    assert all("OK" in status for status in health_before.values()), (
        "'Ceph health is not ok from node: {0}".format(health_before))

    # STEP #3
    show_step(3)
    hardware.warm_restart_nodes(underlay, 'osd01')
    openstack_actions._salt.local(
        tgt='*', fun='cmd.run', args=ntp_resync_cmd)

    # STEP #4
    show_step(4)
    # Get the ceph health output after restart and validate *that* output;
    # re-checking the pre-restart snapshot would always pass here.
    health_after = self.get_ceph_health(underlay, osd_node_names)
    assert all("OK" in status for status in health_after.values()), (
        "'Ceph health is not ok from node: {0}".format(health_after))

    rally.run_container()

    # STEP #5
    show_step(5)
    results = rally.run_tempest(pattern='set=smoke',
                                conf_name='/var/lib/ceph_mcp.conf',
                                report_prefix=func_name,
                                designate_plugin=False,
                                timeout=1800)
    # Step #6
    show_step(6)
    assert not results['fail'], self.show_failed_msg(results['fail'])

    LOG.info("*************** DONE **************")
@pytest.mark.grab_versions
@pytest.mark.fail_snapshot
def _test_restart_cmn_node(self, func_name, underlay, config,
                           openstack_deployed, ceph_deployed,
                           core_actions,
                           salt_actions, openstack_actions,
                           rally, show_step, hardware):
    """Test restart ceph cmn node

    Scenario:
        1. Find ceph cmn nodes
        2. Check ceph health before restart
        3. Restart 1 ceph cmn node
        4. Check ceph health after restart
        5. Run tempest smoke after failover
        6. Check tempest report for failed tests

    Requirements:
        - Salt cluster
        - OpenStack cluster
        - Ceph cluster
    """
    # Same command is run on all minions before and after the restart.
    # NOTE(review): presumably forces a clock resync so that time drift
    # does not affect the Ceph health output - confirm.
    ntp_resync_cmd = 'service ntp stop; ntpd -gq; service ntp start'
    openstack_actions._salt.local(
        tgt='*', fun='cmd.run', args=ntp_resync_cmd)

    # STEP #1
    show_step(1)
    cmn_node_names = underlay.get_target_node_names(
        target='cmn')

    # STEP #2
    show_step(2)
    # Get the ceph health output before restart
    health_before = self.get_ceph_health(underlay, cmn_node_names)
    assert all("OK" in status for status in health_before.values()), (
        "'Ceph health is not ok from node: {0}".format(health_before))

    # STEP #3
    show_step(3)
    hardware.warm_restart_nodes(underlay, 'cmn01')
    openstack_actions._salt.local(
        tgt='*', fun='cmd.run', args=ntp_resync_cmd)

    # STEP #4
    show_step(4)
    # Get the ceph health output after restart and validate *that* output;
    # re-checking the pre-restart snapshot would always pass here.
    health_after = self.get_ceph_health(underlay, cmn_node_names)
    assert all("OK" in status for status in health_after.values()), (
        "'Ceph health is not ok from node: {0}".format(health_after))

    rally.run_container()

    # STEP #5
    show_step(5)
    results = rally.run_tempest(pattern='set=smoke',
                                conf_name='/var/lib/ceph_mcp.conf',
                                report_prefix=func_name,
                                designate_plugin=False,
                                timeout=1800)
    # Step #6
    show_step(6)
    assert not results['fail'], self.show_failed_msg(results['fail'])

    LOG.info("*************** DONE **************")
@pytest.mark.grab_versions
@pytest.mark.fail_snapshot
def _test_restart_rgw_node(self, func_name, underlay, config,
                           openstack_deployed, ceph_deployed,
                           core_actions, hardware,
                           salt_actions, openstack_actions,
                           rally, show_step):
    """Test restart ceph rgw node

    Scenario:
        1. Find ceph rgw nodes
        2. Check ceph health before restart
        3. Restart 1 ceph rgw node
        4. Check ceph health after restart
        5. Run tempest smoke after failover
        6. Check tempest report for failed tests

    Requirements:
        - Salt cluster
        - OpenStack cluster
        - Ceph cluster
    """
    # Same command is run on all minions before and after the restart.
    # NOTE(review): presumably forces a clock resync so that time drift
    # does not affect the Ceph health output - confirm.
    ntp_resync_cmd = 'service ntp stop; ntpd -gq; service ntp start'
    openstack_actions._salt.local(
        tgt='*', fun='cmd.run', args=ntp_resync_cmd)

    # STEP #1
    show_step(1)
    rgw_node_names = underlay.get_target_node_names(
        target='rgw')
    # RGW nodes are optional: skip instead of failing when none are found
    if not rgw_node_names:
        pytest.skip('Skip as there are not rgw nodes in deploy')

    # STEP #2
    show_step(2)
    # Get the ceph health output before restart
    health_before = self.get_ceph_health(underlay, rgw_node_names)
    assert all("OK" in status for status in health_before.values()), (
        "'Ceph health is not ok from node: {0}".format(health_before))

    # STEP #3
    show_step(3)
    hardware.warm_restart_nodes(underlay, 'rgw01')
    openstack_actions._salt.local(
        tgt='*', fun='cmd.run', args=ntp_resync_cmd)

    # STEP #4
    show_step(4)
    # Get the ceph health output after restart and validate *that* output;
    # re-checking the pre-restart snapshot would always pass here.
    health_after = self.get_ceph_health(underlay, rgw_node_names)
    assert all("OK" in status for status in health_after.values()), (
        "'Ceph health is not ok from node: {0}".format(health_after))

    rally.run_container()

    # STEP #5
    show_step(5)
    results = rally.run_tempest(pattern='set=smoke',
                                conf_name='/var/lib/ceph_mcp.conf',
                                designate_plugin=False,
                                report_prefix=func_name,
                                timeout=1800)
    # Step #6
    show_step(6)
    assert not results['fail'], self.show_failed_msg(results['fail'])

    LOG.info("*************** DONE **************")