Measure metadata downtime during live-migration

This patch extends test_server_connectivity_live_migration with a
verification of the downtime that affects the metadata service during
a live migration.
A script is executed in the background on the VM instance while the
instance is live-migrated. Once the live migration has completed, the
script output reports the duration of the metadata outage and the test
compares that duration against a configured threshold.
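
For illustration only, the check boils down to parsing the
PASSED/FAILED lines emitted by the meter script, summing the FAILED
intervals and comparing the total against the allowed downtime. The
sketch below is not part of the patch; the sample output, the helper
name and the threshold value are made up for the example, while the
real logic lives in tempest/common/utils/net_downtime.py:

    # Minimal sketch: sum the per-interval durations reported by the
    # meter script and compare the FAILED total against the threshold.
    ALLOWED_METADATA_DOWNTIME = 6.0  # seconds, mirrors the new default

    def parse_meter_output(output):
        """Return cumulative seconds spent in each state."""
        downtime = {'PASSED': 0.0, 'FAILED': 0.0}
        for line in output.splitlines():
            state, seconds = line.strip().split()
            downtime[state] += float(seconds)
        return downtime

    # Hypothetical meter output captured around a live migration
    sample = "PASSED 0.21\nFAILED 0.20\nFAILED 0.22\nPASSED 0.21\n"
    totals = parse_meter_output(sample)
    assert totals['FAILED'] <= ALLOWED_METADATA_DOWNTIME, (
        "metadata downtime of %.2fs exceeds the allowed %.2fs"
        % (totals['FAILED'], ALLOWED_METADATA_DOWNTIME))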

Change-Id: Id1c58a2fec14ff268c22efc3551fa36f3480f1a0
diff --git a/tempest/common/utils/net_downtime.py b/tempest/common/utils/net_downtime.py
index 9675ec8..6f2a947 100644
--- a/tempest/common/utils/net_downtime.py
+++ b/tempest/common/utils/net_downtime.py
@@ -22,6 +22,32 @@
 
 LOG = log.getLogger(__name__)
 
+PASSED = 'PASSED'
+FAILED = 'FAILED'
+METADATA_SCRIPT_PATH = '/tmp/metadata_meter_script.sh'
+METADATA_RESULTS_PATH = '/tmp/metadata_meter.log'
+METADATA_PID_PATH = '/tmp/metadata_meter.pid'
+# /proc/uptime is used because it includes two decimals in cirros, while
+# `date +%s.%N` does not work in cirros (minimum granularity is seconds)
+METADATA_SCRIPT = """#!/bin/sh
+echo $$ > %(metadata_meter_pidfile)s
+old_time=$(cut -d" " -f1 /proc/uptime)
+while true; do
+    curl http://169.254.169.254/latest/meta-data/hostname 2>/dev/null | \
+grep -q `hostname`
+    result=$?
+    new_time=$(cut -d" " -f1 /proc/uptime)
+    runtime=$(awk -v new=$new_time -v old=$old_time "BEGIN {print new-old}")
+    old_time=$new_time
+    if [ $result -eq 0 ]; then
+        echo "PASSED $runtime"
+    else
+        echo "FAILED $runtime"
+    fi
+    sleep %(interval)s
+done
+"""
+
 
 class NetDowntimeMeter(fixtures.Fixture):
     def __init__(self, dest_ip, interval='0.2'):
@@ -61,3 +87,63 @@
         else:
             LOG.warning('Unexpected output obtained from the pinger: %s',
                         output)
+
+
+class MetadataDowntimeMeter(fixtures.Fixture):
+    def __init__(self, ssh_client,
+                 interval='0.2', script_path=METADATA_SCRIPT_PATH,
+                 output_path=METADATA_RESULTS_PATH,
+                 pidfile_path=METADATA_PID_PATH):
+        self.ssh_client = ssh_client
+        self.interval = interval
+        self.script_path = script_path
+        self.output_path = output_path
+        self.pidfile_path = pidfile_path
+        self.pid = None
+
+    def _setUp(self):
+        self.addCleanup(self.cleanup)
+        self.upload_metadata_script()
+        self.run_metadata_script()
+
+    def upload_metadata_script(self):
+        metadata_script = METADATA_SCRIPT % {
+            'metadata_meter_pidfile': self.pidfile_path,
+            'interval': self.interval}
+        echo_cmd = "echo '{}' > {}".format(
+            metadata_script, self.script_path)
+        chmod_cmd = 'chmod +x {}'.format(self.script_path)
+        self.ssh_client.exec_command(';'.join((echo_cmd, chmod_cmd)))
+        LOG.debug('script created: %s', self.script_path)
+        LOG.debug(self.ssh_client.exec_command(
+            'cat {}'.format(self.script_path)))
+
+    def run_metadata_script(self):
+        self.ssh_client.exec_command('{} > {} &'.format(self.script_path,
+                                                        self.output_path))
+        self.pid = self.ssh_client.exec_command(
+            'cat {}'.format(self.pidfile_path)).strip()
+        LOG.debug('running metadata downtime meter script in background with '
+                  'PID = %s', self.pid)
+
+    def get_results(self):
+        output = self.ssh_client.exec_command(
+            'cat {}'.format(self.output_path))
+        results = {}
+        results['successes'] = output.count(PASSED)
+        results['failures'] = output.count(FAILED)
+        downtime = {PASSED: 0.0, FAILED: 0.0}
+        for line in output.splitlines():
+            key, value = line.strip().split()
+            downtime[key] += float(value)
+
+        results['downtime'] = downtime
+        LOG.debug('metadata downtime meter results: %r', results)
+        return results
+
+    def cleanup(self):
+        if self.pid:
+            self.ssh_client.exec_command('kill {}'.format(self.pid))
+            LOG.debug('killed metadata downtime script with PID %s', self.pid)
+        else:
+            LOG.debug('No metadata downtime script found')
diff --git a/tempest/config.py b/tempest/config.py
index bdc1dce..00d42cb 100644
--- a/tempest/config.py
+++ b/tempest/config.py
@@ -964,6 +964,12 @@
                       "migration, in seconds. "
                       "When the measured downtime exceeds this value, an "
                       "exception is raised."),
+    cfg.FloatOpt('allowed_metadata_downtime',
+                 default=6.0,
+                 help="Allowed VM metadata connection downtime during live "
+                      "migration, in seconds. "
+                      "When the measured downtime exceeds this value, an "
+                      "exception is raised."),
 ]
 
 volume_group = cfg.OptGroup(name='volume',
diff --git a/tempest/scenario/test_network_advanced_server_ops.py b/tempest/scenario/test_network_advanced_server_ops.py
index 911ff42..f4ee98d 100644
--- a/tempest/scenario/test_network_advanced_server_ops.py
+++ b/tempest/scenario/test_network_advanced_server_ops.py
@@ -17,6 +17,7 @@
 
 from oslo_log import log
 from tempest.common import utils
+from tempest.common.utils.linux import remote_client
 from tempest.common.utils import net_downtime
 from tempest.common import waiters
 from tempest import config
@@ -189,6 +190,12 @@
             floating_ip['floating_ip_address'])
         self.useFixture(downtime_meter)
 
+        metadata_downtime_meter = net_downtime.MetadataDowntimeMeter(
+            remote_client.RemoteClient(floating_ip['floating_ip_address'],
+                                       CONF.validation.image_ssh_user,
+                                       pkey=keypair['private_key']))
+        self.useFixture(metadata_downtime_meter)
+
         migration_kwargs = {'host': None, 'block_migration': block_migration}
 
         # check if microversion is less than 2.25 because of
@@ -230,6 +237,18 @@
             "Downtime of {} seconds is higher than expected '{}'".format(
                 downtime, allowed_downtime))
 
+        metadata_downtime_results = metadata_downtime_meter.get_results()
+        self.assertGreater(metadata_downtime_results['successes'], 0)
+        LOG.debug("Metadata Downtime seconds measured = %r",
+                  metadata_downtime_results['downtime'])
+        allowed_metadata_downtime = CONF.validation.allowed_metadata_downtime
+        metadata_downtime_failed = \
+            metadata_downtime_results['downtime']['FAILED']
+        self.assertLessEqual(
+            metadata_downtime_failed, allowed_metadata_downtime,
+            "Metadata downtime: {} seconds is higher than expected: {}".format(
+                metadata_downtime_failed, allowed_metadata_downtime))
+
     def _test_server_connectivity_cold_migration_revert(self, source_host=None,
                                                         dest_host=None):
         keypair = self.create_keypair(client=self.keypairs_client)