import random
from datetime import datetime, timezone

import base64
import dateutil.parser
import exec_helpers
import yaml
import time
import json
import kubernetes
import socket
import re
import pytest
from tabulate import tabulate
import toml
import pytz

from retry import retry
from si_tests.utils import packaging_version as version
from urllib3.exceptions import MaxRetryError, ProtocolError
from si_tests import settings
from si_tests import logger
from si_tests.utils import utils, waiters, exceptions, templates, helpers
from kubernetes.client.rest import ApiException
from websocket._exceptions import WebSocketBadStatusException
from OpenSSL.SSL import Connection, Context, SSLv23_METHOD, TLSv1_2_METHOD
from si_tests.fixtures.cluster import collect_cluster_readiness

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from si_tests.managers.kaas_manager import Cluster, Machine


LOG = logger.logger


def kill_process_in_container_by_word(node, cont_name, grep_word):
    """
    Kill a process inside a docker container, selected by a keyword.

    The container processes are listed with ``docker container top``,
    the ``pid,args`` output is filtered with ``grep`` and the PID taken
    from the first matching line is killed with ``kill -9``.

    NOTE(review): only the first line of the PID list is used, although
    the log message is worded for several processes - confirm whether
    every matching PID was meant to be killed.

    Args:
        node: object providing ``_run_cmd`` and ``name``
        cont_name: name of the docker container
        grep_word: pattern handed to ``grep``
    """
    ssh_key = settings.HA_TEST_PRIVATE_KEY_FILE
    list_cmd = f"docker container top {cont_name} -o pid,args | grep {grep_word} | cut -d ' ' -f 1"
    found = node._run_cmd(list_cmd, ssh_key=ssh_key).stdout_str
    assert found, f'PIDs have not been found in container: {cont_name}, on node: {node.name}'
    # Keep only the first line of the grep result
    pids, *_ = found.split('\n')
    node._run_cmd(f"sudo kill -9 {pids}", ssh_key=ssh_key)
    LOG.info(f'Processes in container: {cont_name} with pids: {pids} has been killed')


def restart_docker_and_check_status(node, docker_client, mke_client):
    """
    Check the swarm state of a node and restart docker when it is 'down'.

    Args:
        node: object providing ``get_k8s_node_name``, ``name`` and ``run_cmd``
        docker_client: client used to dump ``node_ls``/``docker_info`` output
        mke_client: client used to read the swarm node by name

    Returns: True when the node availability is 'active' and its state is
        not 'down'; False otherwise (for a 'down' node the docker service
        is restarted before returning)
    """
    swarm_node = mke_client.get_node_by_name(node_name=node.get_k8s_node_name())
    state = swarm_node['Status']['State']

    if swarm_node['Spec']['Availability'] != 'active':
        return False
    if state != 'down':
        return True

    LOG.error(f"Docker Node {node.name} status is 'down' while expected 'active',"
              "bug PRODX-25664, workaround: restarting docker service")
    LOG.info(docker_client.node_ls(verbose=True))
    LOG.error(docker_client.docker_info(machine_name=node.name, verbose=True))
    node.run_cmd("sudo systemctl restart docker.service", ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
    return False


class ClusterCheckManager(object):
    """Cluster check manager

    Groups check and wait helpers that operate on one Cluster object.
    """

    # Pod names kept in an exclusion list; the code consuming the list is
    # outside this part of the file - presumably pod checks skip them,
    # TODO confirm
    EXCLUDED_PODS = [
        'elasticsearch-curator',
        'ui-e2e-test',
        # UCP internal pods
        'ucp-nvidia-gpu-feature-discovery',
        # compose and compose-api are deprecated
        'compose',
        'compose-api',
        'tungsten-pytest'
    ]
    # Job names kept in an exclusion list (consumer not visible here)
    EXCLUDED_JOBS = [
        'elasticsearch-curator',
        'mariadb-phy-backup'
    ]
    # Role binding exclusion list, empty by default (consumer not visible here)
    EXCLUDED_ROLESBINDS = []

    def __init__(self, cluster: "Cluster"):
        """
        Bind the manager to a cluster and reset the generation snapshots.

        Args:
            cluster: Cluster object the checks are executed against
        """
        # Filled by store_deploys_generation(): deployment name -> generation
        self.deploys_generation = dict()
        # Same snapshot with every generation incremented by one
        self.deploys_next_generation = dict()
        self._cluster: "Cluster" = cluster

    @property
    def cluster(self) -> "Cluster":
        """Cluster object this manager was created for (read-only)."""
        bound_cluster = self._cluster
        return bound_cluster

    @property
    def k8sclient(self):
        """``k8sclient`` attribute of the bound cluster."""
        bound_cluster = self._cluster
        return bound_cluster.k8sclient

    def get_deploys_generation(self, namespace=None, deploys=None):
        """
        Collect the observed generation of deployments.

        Args:
            namespace: namespace to list deployments from when ``deploys``
                       is not given; when both are empty all deployments
                       are listed
            deploys: explicit deployment objects to inspect

        Returns: dict: deployment name -> observed generation as int
                 (0 when the status or the generation is missing)
        """
        if not deploys:
            deployments_api = self.cluster.k8sclient.deployments
            if namespace:
                deploys = list(deployments_api.list(namespace=namespace))
            else:
                deploys = deployments_api.list_all()

        result = {}
        for item in deploys:
            status = item.read().to_dict().get('status') or {}
            observed = status.get('observed_generation') or 0
            result[item.name] = int(observed)
        return result

    def store_deploys_generation(self, namespace=None, deploys=None):
        """
        Remember the current deployment generations and the expected next ones.

        Stores the result of ``get_deploys_generation`` in
        ``self.deploys_generation`` and the same mapping with every value
        incremented by one in ``self.deploys_next_generation``.

        Args:
            namespace: forwarded to ``get_deploys_generation``
            deploys: forwarded to ``get_deploys_generation``
        """
        snapshot = self.get_deploys_generation(namespace=namespace, deploys=deploys)
        expected = {}
        for deploy_name, generation in snapshot.items():
            expected[deploy_name] = generation + 1
        self.deploys_generation = snapshot
        self.deploys_next_generation = expected

    def wait_deploy_generation_changed(self, namespace=None, deploys=None,
                                       timeout=600, interval=60):
        """
        Wait until deployment generations match the stored expected values.

        ``store_deploys_generation`` must be called beforehand: the current
        generations are compared with ``self.deploys_next_generation``.

        Args:
            namespace: forwarded to ``get_deploys_generation``
            deploys: forwarded to ``get_deploys_generation``
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """
        LOG.info("Waiting for generations are changed")
        for k, v in self.deploys_generation.items():
            LOG.info("{}: {} -> {}".format(
                k, v, self.deploys_next_generation[k]))

        def current_generations():
            return self.get_deploys_generation(namespace=namespace,
                                               deploys=deploys)

        def status_message():
            # Passed as a callable so the generations are read when the
            # status is reported, not once before the wait has started
            return {
                'expected_generations': self.deploys_next_generation,
                'current_generations': current_generations(),
            }

        waiters.wait(
            lambda: current_generations() == self.deploys_next_generation,
            timeout=timeout, interval=interval,
            timeout_msg=f"Some generations are not changed after {timeout} sec.",
            status_msg_function=status_message)

    def wait_for_release_in_bundle(self, name, timeout=600, interval=60):
        """
        Wait until a release with the given name is listed in the helm bundle.

        Args:
            name: release name looked up in ``spec.releases`` of the bundle
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def does_release_exist():
            bundle = self.cluster.get_helmbundle()
            if not bundle:
                return False
            releases = bundle.data.get('spec', {}).get('releases', [])
            return any(release['name'] == name for release in releases)

        waiters.wait(
            does_release_exist,
            timeout=timeout,
            interval=interval,
            timeout_msg=(f"Timeout waiting for release {name} to appear in "
                         f"cluster helm bundle after {timeout} sec."))
        LOG.info(f"Release {name} found in cluster helm bundle")

    def get_nodeworkloadlock_state(self, name):
        """
        Read ``status.state`` of a NodeWorkloadLock.

        Args:
            name: name of the nodeworkloadlock object

        Returns: value of ``status.state`` or None when it is not set
        Raises: ValueError when the object is not found
        """
        lock = self.cluster.get_nodeworkloadlock(name)
        if lock:
            lock_status = lock.data.get('status') or {}
            return lock_status.get('state')
        raise ValueError(f"Node workloadlock with name {name} not "
                         f"found")

    def wait_nodeworkloadlock_state(self, name, state='active',
                                    timeout=2400, interval=60):
        """
        Wait until a NodeWorkloadLock reports the expected ``status.state``.

        Args:
            name: name of the nodeworkloadlock object
            state: expected state
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        Raises: ValueError from the check when the object is not found
        """

        def status_message():
            lock = self.cluster.get_nodeworkloadlock(name)
            lock_status = lock.data.get('status') or {}
            return {
                'expected_state': state,
                'current_state': lock_status.get('state'),
            }

        def is_expected_state():
            # Same lookup and the same ValueError as the public getter
            return self.get_nodeworkloadlock_state(name) == state

        waiters.wait(is_expected_state,
                     timeout=timeout,
                     interval=interval,
                     timeout_msg=(f"Timeout for waiting nodeworkloadlock "
                                  f"state after {timeout} sec."),
                     status_msg_function=status_message)

    def wait_nodemaintenancerequest_created(self, node_name=None, timeout=900, interval=30):
        """
        Wait until a NodeMaintenanceRequest object is returned for the node.

        Args:
            node_name: passed as ``name_prefix`` to the lookup
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def request_found():
            return self.cluster.get_nodemaintenancerequests(name_prefix=node_name)

        message = (f"Timeout for waiting nodemaintenancerequest for cluster "
                   f"after {timeout} sec.")
        waiters.wait(request_found, timeout=timeout, interval=interval,
                     timeout_msg=message)

    def wait_nodemaintenancerequest_not_existed(self, node_name=None, timeout=2400, interval=30):
        """
        Wait until no NodeMaintenanceRequest object is returned for the node.

        Args:
            node_name: passed as ``name_prefix`` to the lookup
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def request_gone():
            return not self.cluster.get_nodemaintenancerequests(name_prefix=node_name)

        message = (f"Timeout for waiting nodemaintenancerequest removing for cluster "
                   f"after {timeout} sec.")
        waiters.wait(request_gone, timeout=timeout, interval=interval,
                     timeout_msg=message)

    def get_clusterworkloadlock_state(self, name):
        """
        Read ``status.state`` of a ClusterWorkloadLock.

        Args:
            name: name of the clusterworkloadlock object

        Returns: value of ``status.state`` or None when it is not set
        Raises: ValueError when the object is not found
        """
        lock = self.cluster.get_clusterworkloadlock(name)
        if lock:
            lock_status = lock.data.get('status') or {}
            return lock_status.get('state')
        raise ValueError(f"Cluster workloadlock with name {name} not "
                         f"found")

    def wait_clusterworkloadlock_state(self, name, state='active',
                                       timeout=2600, interval=60):
        """
        Wait until a ClusterWorkloadLock reports the expected ``status.state``.

        Args:
            name: name of the clusterworkloadlock object
            state: expected state
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        Raises: ValueError from the check when the object is not found
        """

        def status_message():
            workloadlock = self.cluster.get_clusterworkloadlock(name)
            workloadlock_status = workloadlock.data.get('status') or {}
            return {
                'expected_state': state,
                'current_state': workloadlock_status.get('state'),
            }

        def is_expected_state():
            # Single lookup per poll; the getter raises ValueError for a
            # missing object and tolerates an empty or null status
            return self.get_clusterworkloadlock_state(name) == state

        waiters.wait(is_expected_state, timeout=timeout,
                     interval=interval,
                     # trailing space keeps the name apart from the word "state"
                     timeout_msg=f"Timeout for waiting clusterworkloadlock {name} "
                                 f"state after {timeout} sec.",
                     status_msg_function=status_message)

    def wait_clustermaintenancerequest_created(self, timeout=900, interval=30):
        """
        Wait until a ClusterMaintenanceRequest object is returned for the cluster.

        Args:
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def request_found():
            return self.cluster.get_clustermaintenancerequests()

        message = (f"Timeout for waiting clustermaintenancerequest for cluster "
                   f"after {timeout} sec.")
        waiters.wait(request_found, timeout=timeout, interval=interval,
                     timeout_msg=message)

    def wait_clustermaintenancerequest_not_existed(self, timeout=900, interval=30):
        """
        Wait until no ClusterMaintenanceRequest object is returned for the cluster.

        Args:
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """
        waiters.wait(lambda: not self.cluster.get_clustermaintenancerequests(),
                     timeout=timeout,
                     interval=interval,
                     # message names the removal, so it differs from the
                     # message used when waiting for creation
                     timeout_msg=f"Timeout for waiting clustermaintenancerequest removing for cluster "
                                 f"after {timeout} sec.")

    def _check_maintenance_status(self, obj, expected_status: bool):
        """
        Check maintenance status
        Args:
            obj: Cluster or Machine object
            expected_status: bool: expected status

        Returns: bool: True when ``status.providerStatus.maintenance``
                 (False when absent) equals the expected status

        """
        status = obj.data.get('status') or {}
        provider_status = status.get('providerStatus', {})
        actual_status = provider_status.get('maintenance', False)
        LOG.info(f"Expected maintenance status is {expected_status}, actual status is {actual_status}")
        is_expected = actual_status == expected_status
        return is_expected

    def wait_cluster_maintenance_status(self, expected_status=False, timeout=1200, interval=30):
        """
        Wait for the expected maintenance status of the cluster
        Args:
            expected_status: boolean value
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def status_matches():
            return self._check_maintenance_status(self.cluster, expected_status)

        message = (f"Timeout for waiting expected Cluster Maintenance Request status "
                   f"after {timeout} sec.")
        waiters.wait(status_matches, timeout=timeout, interval=interval,
                     timeout_msg=message)

    def wait_machine_maintenance_status(self, machine_name, expected_status=False, timeout=3600, interval=30):
        """
        Wait for the expected maintenance status of a machine
        Args:
            machine_name: name of expected machine
            expected_status: boolean value
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """
        # The machine object is looked up once, before polling starts
        target = self.cluster.get_machine(machine_name)

        def status_matches():
            return self._check_maintenance_status(target, expected_status)

        message = (f"Timeout for waiting expected Machine Maintenance Request status "
                   f"after {timeout} sec.")
        waiters.wait(status_matches, timeout=timeout, interval=interval,
                     timeout_msg=message)

    def wait_evacuation_status(self, machine_name, evacuation, evacuation_expected=True, timeout=900, interval=30):
        """
        Wait for the LCMMachine evacuation status to match the expectation
        Args:
            machine_name: name of expected machine
            evacuation: evacuation status to check
            evacuation_expected: True to wait until the status equals
                                 ``evacuation``, False to wait until it differs
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """
        # The LCMMachine object is looked up once, before polling starts
        lcm_machine = self.cluster.get_cluster_lcmmachine(machine_name, self.cluster.namespace)

        def check_status():
            status = lcm_machine.data.get('status') or {}
            evacuation_status = status.get('evacuation')
            LOG.info(f"Actual evacuation status of LCMMachine '{lcm_machine.namespace}/{lcm_machine.name}' "
                     f"is {evacuation_status}, status {evacuation} is expected: {evacuation_expected}")
            has_status = evacuation_status == evacuation
            # Expected status must be present; an unexpected one must be absent
            if evacuation_expected:
                return has_status
            return not has_status

        message = (f"Timeout for waiting expected LCMMachine evacuation status "
                   f"after {timeout} sec.")
        waiters.wait(lambda: check_status(),
                     timeout=timeout,
                     interval=interval,
                     timeout_msg=message)

    def wait_nodedeletionrequest_created(self, name, timeout=900, interval=30):
        """
        Wait until the NodeDeletionRequest object is returned by the cluster.

        Args:
            name: name of the nodedeletionrequest object
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def request_found():
            return self.cluster.get_nodedeletionrequest(name)

        message = (f"Timeout for waiting nodedeletionrequest {name} for cluster "
                   f"after {timeout} sec.")
        waiters.wait(request_found, timeout=timeout, interval=interval,
                     timeout_msg=message)

    def wait_nodedeletionrequest_not_existed(self, name, timeout=900, interval=30):
        """
        Wait until the NodeDeletionRequest object is no longer returned.

        Args:
            name: name of the nodedeletionrequest object
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def request_gone():
            return not self.cluster.get_nodedeletionrequest(name)

        message = (f"Timeout for waiting nodedeletionrequest {name} removing for cluster "
                   f"after {timeout} sec.")
        waiters.wait(request_gone, timeout=timeout, interval=interval,
                     timeout_msg=message)

    def wait_machine_prepare_deletion_phase(self, machine_name, expected_phase='completed', timeout=1800, interval=30):
        """
        Wait expected Machine Prepare Deletion Phase
        Args:
            machine_name: name of expected Machine
            expected_phase: ('started', 'completed')
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        Raises: Exception from the check when the machine is not found and
                the expected phase is neither 'completed' nor ''
        """

        def get_phase():
            machine = self.cluster.get_machine(machine_name)
            if machine:
                try:
                    actual_phase = machine.get_prepare_deletion_phase()
                except ApiException as e:
                    # API errors are logged and the check is retried
                    LOG.error(f"Couldn't get status.providerStatus.prepareDeletionPhase "
                              f"for Machine '{machine_name}':\n{e}")
                    return False
            elif expected_phase in ('completed', ''):
                # A machine that is already gone counts as the expected phase
                actual_phase = expected_phase
            else:
                raise Exception(f'machine {machine_name} not found in cluster {self.cluster.name}')
            LOG.info(f"Expected machine prepare deletion phase is {expected_phase}, actual is {actual_phase}")
            return expected_phase == actual_phase

        waiters.wait(lambda: get_phase(),
                     timeout=timeout,
                     interval=interval,
                     # trailing space keeps "phase" and "after" as separate words
                     timeout_msg=f"Timeout for waiting expected LCMMachine prepare deletion phase "
                                 f"after {timeout} sec.")

    def _check_graceful_reboot_request(self, expected_status: bool):
        """
        Compare existence of the GracefulRebootRequest with the expectation.

        Args:
            expected_status: True if the object is expected to exist

        Returns: bool: True when "object is not None" equals expected_status
        """
        request = self.cluster.get_gracefulrebootrequest()
        return (request is not None) == expected_status

    def wait_graceful_reboot_request(self, expected_status=True, timeout=300, interval=30):
        """Wait for the GracefulRebootRequest to exist or to be absent
        Args:
            expected_status: True if expecting existence of the object, False if expecting absence
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def request_state_matches():
            return self._check_graceful_reboot_request(expected_status)

        message = (f"Timeout for waiting expected Graceful RebootRequest status "
                   f"after {timeout} sec.")
        waiters.wait(request_state_matches, timeout=timeout, interval=interval,
                     timeout_msg=message)

    def _check_machines_reboot(self, boot_time_dict, reboot_state: dict, expected_stuck_machine_names=None):
        """
        Check whether all tracked machines have rebooted and are Ready.

        Args:
            boot_time_dict: machine name -> boot time (ISO string) taken
                            before the reboot was requested
            reboot_state: machine name -> bool; updated in place, a machine
                          is marked True once a newer boot time is seen
            expected_stuck_machine_names: forwarded to the machines
                                          conditions check

        Returns: bool: True when every machine is rebooted and the
                 conditions check passed, False otherwise
        """
        current_boot_times = self.cluster.get_boot_time_dict(exclude_bastion=True)
        pending = []
        for machine_name in reboot_state:
            if reboot_state[machine_name]:
                continue
            booted_before = dateutil.parser.isoparse(boot_time_dict[machine_name])
            booted_now = dateutil.parser.isoparse(current_boot_times[machine_name])
            if booted_now > booted_before:
                reboot_state[machine_name] = True
            else:
                pending.append(machine_name)

        stuck_names = expected_stuck_machine_names or []
        try:
            self.cluster.check._check_machines_conditions("Ready",
                                                          expected_stuck_machine_names=stuck_names)
        except (RuntimeError, ApiException, KeyError, TypeError, MaxRetryError):
            # A failed conditions check only means "not ready yet"
            machines_are_ready = False
        else:
            machines_are_ready = True

        if pending:
            LOG.info(f"Following machines are not rebooted yet: {', '.join(pending)}")
            return False
        LOG.info('All requested machines are rebooted')
        if machines_are_ready:
            return True
        LOG.info('Wait for "Ready" machines status')
        return False

    def wait_machines_reboot(self, machines_boot_time_dict, timeout=1200, interval=60,
                             expected_stuck_machine_names=None):
        """Wait for machines to be rebooted
        Args:
            machines_boot_time_dict: dict of LCM Machines: name -> boot time. Must be populated before
                                     requesting a reboot. Machines from the dict will be waited for reboot
                                     by polling for a newer boot time.
            timeout: timeout to wait
            interval: time between checks
            expected_stuck_machine_names: forwarded to the reboot check
        Returns: None
        """
        # Per-machine progress shared between polls
        rebooted = dict.fromkeys(machines_boot_time_dict.keys(), False)

        def all_rebooted():
            return self._check_machines_reboot(machines_boot_time_dict, rebooted,
                                               expected_stuck_machine_names)

        waiters.wait(all_rebooted, timeout=timeout, interval=interval,
                     timeout_msg=f"Machine reboot timeout after {timeout} sec.")

    def _check_machines_migrated(self, expected_stuck_machine_names=None, target_runtime='containerd'):
        """
        Check whether all machines use the target runtime and are Ready.

        Args:
            expected_stuck_machine_names: forwarded to the machines
                                          conditions check
            target_runtime: runtime every machine is expected to report

        Returns: bool: True when every machine reports the target runtime
                 and the conditions check passed, False otherwise
        """
        runtimes = self.cluster.get_runtime_dict(exclude_bastion=True)
        pending = [machine_name for machine_name, runtime in runtimes.items()
                   if runtime != target_runtime]

        stuck_names = expected_stuck_machine_names or []
        try:
            self.cluster.check._check_machines_conditions("Ready",
                                                          expected_stuck_machine_names=stuck_names)
        except (RuntimeError, ApiException, KeyError, TypeError, MaxRetryError):
            # A failed conditions check only means "not ready yet"
            machines_are_ready = False
        else:
            machines_are_ready = True

        if pending:
            LOG.info(f"Following machines are not migrated yet: {', '.join(pending)}")
            return False
        LOG.info('All requested machines are migrated')
        if machines_are_ready:
            return True
        LOG.info('Wait for "Ready" machines status')
        return False

    def wait_migration_stuck(self, timeout=1200, interval=60,
                             expected_stuck_machine_names=None, target_runtime='containerd'):
        """Wait for machines to report the target runtime
        Args:
            timeout: timeout to wait
            interval: time between checks
            expected_stuck_machine_names: expected stuck machine name
            target_runtime: runtime the machines are expected to report
        Returns: None
        """

        def all_migrated():
            return self._check_machines_migrated(expected_stuck_machine_names, target_runtime)

        waiters.wait(all_migrated, timeout=timeout, interval=interval,
                     timeout_msg=f"Machines migration timeout after {timeout} sec.")

    def exec_ceph_tools_command(self, cmd: str, return_json=False, raise_on_fail=False):
        """
        Run a command in the ceph tools pod via ``/bin/sh -c``.

        Args:
            cmd: command line to execute
            return_json: append ``-f json`` to the command and parse the output
            raise_on_fail: raise instead of returning an empty result when
                           the execution fails

        Returns: raw output when ``return_json`` is False; otherwise the
                 parsed output (an error is logged when it is not a dict).
                 On execution failure without ``raise_on_fail`` or on a
                 parsing error an empty dict (json mode) or an empty
                 string is returned
        Raises: Exception when execution fails and ``raise_on_fail`` is set
        """
        tools_pod = self.cluster.get_ceph_tool_pod()
        if return_json:
            cmd += ' -f json'

        try:
            raw = tools_pod.exec(['/bin/sh', '-c', cmd])
        except Exception as ex:
            _msg = f"Failed to execute command {cmd}\n{str(ex)}"
            LOG.error(_msg)
            if raise_on_fail:
                raise Exception(_msg)
            return dict() if return_json else str()

        if not return_json:
            return raw

        try:
            parsed = yaml.safe_load(raw)
            if not isinstance(parsed, dict):
                # Logged only: the non-dict value is still returned
                LOG.error(f"Cannot convert command output to dictionary: {parsed}")
        except yaml.YAMLError:
            LOG.error(f"Failed to decode the output: {raw}")
            return dict()
        return parsed

    def get_ceph_status_details(self) -> dict:
        """
        Run ``ceph -s`` in the ceph tools pod with JSON output requested

        Returns: result of exec_ceph_tools_command in return_json mode

        """
        return self.exec_ceph_tools_command('ceph -s', return_json=True)

    def get_ceph_health_detail(self) -> dict:
        """
        Run ``ceph health detail`` in the ceph tools pod with JSON output requested

        Returns: result of exec_ceph_tools_command in return_json mode

        """
        return self.exec_ceph_tools_command('ceph health detail', return_json=True)

    def get_kaascephstatus(self, name=None):
        """
        Get the ``status`` section of the kaascephcluster object

        Returns: dict: the status, or an empty dict when it is missing or empty
        """
        kaascephcluster = self.cluster.get_kaascephcluster(name)
        return kaascephcluster.data.get('status') or {}

    def get_miracephstatus(self, name='rook-ceph'):
        """
        Get the ``status`` section of the miracephcluster object

        Returns: dict: the status, or an empty dict when it is missing or empty
        """
        miraceph = self.cluster.get_miracephcluster(name)
        return miraceph.data.get('status') or {}

    def get_miracephhealths_status(self, name='rook-ceph'):
        """
        Get the ``status`` section of the miracephhealth object

        Returns: dict: the status, or an empty dict when it is missing or empty
        """
        miracephhealth = self.cluster.get_miracephhealth(name)
        return miracephhealth.data.get('status') or {}

    def get_miracephhealths_health_status(self):
        """
        Get Ceph Cluster health from MiraCephHealth object
        (``fullClusterStatus.clusterStatus.ceph.health``), None when absent
        """
        status = self.get_miracephhealths_status()
        cluster_status = status.get('fullClusterStatus', {}).get('clusterStatus', {})
        ceph_section = cluster_status.get('ceph', {})
        return ceph_section.get('health')

    def get_ceph_health_status(self):
        """
        Get Ceph Cluster health from kaascephcluster object
        (``fullClusterInfo.clusterStatus.ceph.health``), None when absent
        """
        status = self.get_kaascephstatus()
        cluster_status = status.get('fullClusterInfo', {}).get('clusterStatus', {})
        ceph_section = cluster_status.get('ceph', {})
        return ceph_section.get('health')

    def get_ceph_phase(self):
        """
        Get Ceph Cluster reconcile phase from kaascephcluster object
        (``fullClusterInfo.clusterStatus.phase``), None when absent
        """
        status = self.get_kaascephstatus()
        cluster_status = status.get('fullClusterInfo', {}).get('clusterStatus', {})
        return cluster_status.get('phase')

    def get_ceph_state(self):
        """
        Get Ceph Cluster common status from kaascephcluster object
        (``fullClusterInfo.clusterStatus.state``), None when absent
        """
        status = self.get_kaascephstatus()
        cluster_status = status.get('fullClusterInfo', {}).get('clusterStatus', {})
        return cluster_status.get('state')

    def wait_ceph_health_status(self, expected_status='HEALTH_OK', timeout=600,
                                interval=30):
        """
        Wait until ``ceph health detail`` reports the expected status.

        Args:
            expected_status: expected value of the ``status`` field
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        # do not fail on HEALTH_OK waiting if we have ceph-exporter crashed on host
        # warning - just clean up this message to proceed
        # https://mirantis.jira.com/browse/PRODX-42908
        def ceph_health_status(expected):
            ceph_health_details = self.get_ceph_health_detail()
            if not isinstance(ceph_health_details, dict):
                # A failed or unparsable command must not abort the wait
                ceph_health_details = {}
            current_status = ceph_health_details.get('status')
            if expected == 'HEALTH_OK' and current_status == 'HEALTH_WARN':
                current_checks = ceph_health_details.get('checks') or {}
                recent_crash = current_checks.get('RECENT_CRASH') or {}
                for detail in recent_crash.get('detail') or []:
                    if 'ceph-exporter crashed on host' in detail.get('message', ''):
                        self.exec_ceph_tools_command('ceph crash archive-all',
                                                     return_json=False,
                                                     raise_on_fail=True)
                        break
            # None (status unknown) never equals the expected status, so the
            # caller keeps polling
            return current_status

        waiters.wait(
            lambda: ceph_health_status(expected_status) == expected_status,
            timeout=timeout, interval=interval,
            timeout_msg=f"Ceph status failed to became "
                        f"{expected_status} in {timeout} sec")

    def get_kaascephcluster_state(self, name=None):
        """Return ``shortClusterInfo.state`` of the kaascephcluster status, None when absent."""
        short_info = self.get_kaascephstatus(name).get('shortClusterInfo', {})
        return short_info.get('state')

    def get_miraceph_phase(self, name='rook-ceph'):
        """Return ``phase`` of the miracephcluster status, None when absent."""
        miraceph_status = self.get_miracephstatus(name)
        return miraceph_status.get('phase')

    def get_kaascephstate_message(self, name=None):
        """Return ``kaasCephState`` of the kaascephcluster status, '' when absent."""
        kaasceph_status = self.get_kaascephstatus(name)
        return kaasceph_status.get('kaasCephState', '')

    def check_child_ceph_cluster(self,
                                 wait_status='Created',
                                 check_health=False):
        """
        Check the state (and optionally health) of the rook-ceph cluster.

        self.cluster = child cluster object

        Args:
            wait_status: expected value of ``status.state``
            check_health: additionally require ``status.ceph.health`` to be
                          HEALTH_OK or HEALTH_WARN

        Returns: bool: True when the checks pass; False otherwise, including
                 any exception raised while reading the object
        """
        try:
            # switch to child KUBECONFIG api
            rookceph = self.cluster.k8sclient.rookcephclusters.get(
                name='rook-ceph', namespace='rook-ceph').data or {}
            rookceph_status = rookceph.get('status') or {}
            rookcephstate = rookceph_status.get('state', 'NotExist')
            LOG.info("Rookceph status is {}".format(rookcephstate))
            if wait_status != rookcephstate:
                return False
            if check_health:
                # Read both fields through the None-safe status so a null
                # status is not reported as a missing cluster
                rookceph_ceph = rookceph_status.get('ceph') or {}
                rookceph_health = rookceph_ceph.get('health', 'NotExist')
                rookceph_health_detail = rookceph_ceph.get('details', '')
                LOG.info(f'Rookceph health is: {rookceph_health}\n'
                         f'Details: {rookceph_health_detail}')
                if rookceph_health not in ['HEALTH_OK', 'HEALTH_WARN']:
                    return False
            return True
        except Exception:
            # Best effort: any failure is treated as "cluster is not there"
            LOG.warning('rook-ceph cluster does not exist')
            return False

    def get_ceph_nodes(self, data, ns, _bm_nodes):
        """
        Collect custom ceph node mappings for storage machines.

        Args:
            data: dict; mappings are collected only when its
                  ``cephClusterMapping`` value is 'profiled'
            ns: object providing ``_get_storage_machines``
            _bm_nodes: iterable of dicts with ``name`` and optional
                       ``ceph_cluster_node`` keys

        Returns: list of ``ceph_cluster_node`` entries (each gets a ``name``
                 key set from the machine ``status.instanceName``) when
                 kaascephcluster usage is skipped, otherwise dict:
                 machine name -> ``ceph_cluster_node`` entry
        """
        LOG.info("Getting CEPH nodes")
        c_nodes_data = [] if self.cluster.workaround.skip_kaascephcluster_usage() else {}
        if data.get('cephClusterMapping', False) != 'profiled':
            return c_nodes_data

        # Collect all machines, that supposed to be used in ceph
        # and check, that we have mapping for those node
        for machine in ns._get_storage_machines():
            host_selector = machine['spec']['providerSpec']['value'].get('hostSelector', {})
            hwid = host_selector.get('matchLabels', {}).get(
                'kaas.mirantis.com/baremetalhost-id', False)
            if not hwid:
                LOG.info(f"Machine {machine['metadata']['name']} "
                         f"not supposed "
                         f"to be used in ceph,since don't have"
                         f"'kaas.mirantis.com/baremetalhost-id' hostSelector,"
                         f"continue..")
                continue
            for _node in _bm_nodes:
                if _node.get('name') != hwid or not _node.get('ceph_cluster_node'):
                    continue
                LOG.info(f"Found custom ceph mapping:\n"
                         f"{_node['ceph_cluster_node']}\n"
                         f"for node: {_node['name']}")
                mapping = _node.get('ceph_cluster_node')
                if self.cluster.workaround.skip_kaascephcluster_usage():
                    # The mapping from _bm_nodes is updated in place
                    mapping["name"] = machine.get('status', {}).get(
                        'instanceName', {})
                    c_nodes_data.append(mapping)
                else:
                    c_nodes_data[machine['metadata']['name']] = mapping
        LOG.info(f"Ceph node data for cluster:\n{c_nodes_data}")
        return c_nodes_data

    def wait_miraceph_phase(self, expected_phase='Ready', timeout=1800, interval=60):
        """
        Wait until the miraceph object reports the expected phase.

        Args:
            expected_phase: expected value of the status ``phase``
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def status_message():
            # Passed as a callable so the phase is read when the status is
            # reported, not once before the wait has started
            return {
                'expected_phase': expected_phase,
                'current_phase': self.get_miraceph_phase(),
            }

        waiters.wait(
            lambda: self.get_miraceph_phase() == expected_phase,
            timeout=timeout,
            interval=interval,
            timeout_msg=f"Timeout waiting for miraceph become "
                        f"{expected_phase} state in {timeout} sec.",
            status_msg_function=status_message)

    def wait_miracephhealth_state(self, expected_phase='Ready', timeout=1800, interval=60):
        """
        Wait until the miracephhealth object reports the expected state.

        Args:
            expected_phase: expected value of the status ``state``
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """

        def current_state():
            return self.get_miracephhealths_status().get('state', '')

        def status_message():
            # Passed as a callable so the state is read when the status is
            # reported, not once before the wait has started
            return {
                'expected_state': expected_phase,
                'current_state': current_state(),
            }

        waiters.wait(
            lambda: current_state() == expected_phase,
            timeout=timeout,
            interval=interval,
            timeout_msg=f"Timeout waiting for miracephhealth become "
                        f"{expected_phase} state in {timeout} sec.",
            status_msg_function=status_message)

    def wait_kaascephcluster_state(self, expected_state='Ready',
                                   timeout=1800, interval=60):
        """Poll until ``self.get_kaascephcluster_state()`` equals ``expected_state``.

        Note: the "Current state" part of the timeout message is rendered
        once, before polling starts.

        :param expected_state: state value to wait for
        :param timeout: forwarded to ``waiters.wait`` (reported as seconds
                        in the timeout message)
        :param interval: forwarded to ``waiters.wait``
        """
        failure_text = (f"Timeout waiting for kaascephcluster become {expected_state} "
                        f"state in {timeout} sec. Current state: "
                        f"{self.get_kaascephcluster_state()}")

        def state_reached():
            return self.get_kaascephcluster_state() == expected_state

        waiters.wait(state_reached,
                     timeout=timeout,
                     interval=interval,
                     timeout_msg=failure_text)

    def is_kaascephcluster_removed(self):
        """Tell whether the cluster no longer has a kaascephcluster object.

        For cluster releases older than 'mosk-17-0-4-25-1' the check is not
        applicable and True is returned right away.

        :return: False when ``self.cluster.get_cephcluster()`` returns a
                 truthy object, True otherwise
        """
        current_release = version.parse(self.cluster.clusterrelease_version)
        first_supported_release = version.parse('mosk-17-0-4-25-1')
        check_applicable = current_release >= first_supported_release
        if not check_applicable:
            LOG.info(f"This waiter is not valid for "
                     f"Cluster version: {self.cluster.clusterrelease_version}. Skip this check...")
            return True

        if self.cluster.get_cephcluster():
            LOG.info(f"Found kaascephcluster for child cluster {self.cluster.name}")
            return False
        return True

    def wait_kaascephcluster_removed(self, timeout=600, interval=20):
        """Poll ``is_kaascephcluster_removed`` until it returns a truthy value.

        :param timeout: forwarded to ``waiters.wait``
        :param interval: forwarded to ``waiters.wait``
        """
        def removed():
            return self.is_kaascephcluster_removed()

        waiters.wait(removed, timeout=timeout, interval=interval)

    def is_default_storage_class_exists(self, name):
        """Check that the storage class ``name`` can be read from the cluster.

        Any exception raised while reading (or logging) the storage class is
        logged at debug level and treated as "does not exist".

        :param name: storage class name
        :return: True when the read succeeded, False otherwise
        """
        found = False
        try:
            sc = self.cluster.k8sclient.storageclass.read(name)
            LOG.info(f"Cluster default storage class {sc}")
            found = True
        except Exception as e:
            LOG.debug(e)
        return found

    def wait_default_storage(self, ceph_cluster_pools, timeout=1800, interval=60):
        """Wait until the storage class of the default Ceph pool exists.

        The storage class name is built as ``<name>-<deviceClass>`` from the
        pools that have a truthy ``default`` field; only the first such pool
        is waited for. When no pool is marked as default, a warning is logged
        and the method returns without waiting.

        :param ceph_cluster_pools: iterable of pool dicts
        :param timeout: forwarded to ``waiters.wait``
        :param interval: forwarded to ``waiters.wait``
        """
        default_names = []
        for pool in ceph_cluster_pools:
            if pool.get('default'):
                default_names.append(f"{pool.get('name')}-{pool.get('deviceClass')}")

        if not default_names:
            LOG.warning('Default Storage class not found')
            return

        def default_class_present():
            return self.is_default_storage_class_exists(default_names[0]) is True

        waiters.wait(
            default_class_present,
            timeout=timeout,
            interval=interval,
            timeout_msg=f"Timeout waiting for default storage: {default_names}"
        )

    def get_cephosdremoverequest_phase(self, name, namespace):
        """Return the ``status.phase`` of a CephOsdRemoveRequest object.

        :param name: request object name
        :param namespace: request object namespace
        :return: the phase value, or None when the object has no status yet
                 or the status has no ``phase`` field
        """
        ceph_request = self.cluster.k8sclient.cephosdremoverequests.get(name=name, namespace=namespace)
        ceph_request_data = ceph_request.data
        ceph_request_status = ceph_request_data.get('status') or {}
        if not ceph_request_status:
            LOG.debug(f"Request object {name} is not ready yet. Object data:\n{yaml.dump(ceph_request_data)}")
            # A single value (not a tuple) keeps the return shape identical
            # to the "ready" branch below, which callers compare to a phase.
            return None
        return ceph_request_status.get('phase')

    def get_cephperftestrequest_phase(self, name, namespace):
        """Return the ``status.phase`` of a CephPerfTestRequest object.

        :param name: request object name
        :param namespace: request object namespace
        :return: the phase value, or None when the object has no status yet
                 or the status has no ``phase`` field
        """
        ceph_request = self.cluster.k8sclient.cephperftestrequests.get(name=name, namespace=namespace)
        ceph_request_data = ceph_request.data
        ceph_request_status = ceph_request_data.get('status') or {}
        if not ceph_request_status:
            LOG.debug(f"Request object {name} is not ready yet. Object data:\n{yaml.dump(ceph_request_data)}")
            # A single value (not a tuple) keeps the return shape identical
            # to the "ready" branch below, which callers compare to a phase.
            return None
        return ceph_request_status.get('phase')

    def get_cephoperationrequest_state(self, name, namespace, operation_key=None):
        """Return request and operation phases of a Ceph operation request.

        :param name: request object name
        :param namespace: request object namespace
        :param operation_key: optional key inside ``status`` whose nested
                              ``phase`` should be reported as well
        :return: tuple ``(request_phase, operation_phase)``; both are None
                 when the object has no status yet, and ``operation_phase``
                 is None when ``operation_key`` is not given
        """
        request_obj = self.cluster.k8sclient.kaas_cephoperationrequests.get(name=name, namespace=namespace)
        request_data = request_obj.data
        status = request_data.get('status') or {}

        if status:
            operation_phase = None
            request_phase = status.get('phase', None)
            if operation_key:
                operation_phase = status.get(operation_key, {}).get('phase', None)
            return request_phase, operation_phase

        LOG.debug(f"Request object {name} is not ready yet. Object data:\n{yaml.dump(request_data)}")
        return None, None

    def wait_cephperftestrequest_state(self, name, namespace, expected_request_phase='Finished',
                                       timeout=3000, interval=120):
        """Poll a CephPerfTestRequest until its phase equals the expected one.

        :param name: request object name
        :param namespace: request object namespace
        :param expected_request_phase: phase to wait for
        :param timeout: forwarded to ``waiters.wait``
        :param interval: forwarded to ``waiters.wait``
        """
        def describe_progress():
            # Passed to waiters.wait as status_msg_function.
            current = self.get_cephperftestrequest_phase(name, namespace)
            return f"Current request phase is: {current}. Expected: {expected_request_phase}. "

        def phase_reached():
            current = self.get_cephperftestrequest_phase(name=name, namespace=namespace)
            return current == expected_request_phase

        waiters.wait(phase_reached,
                     timeout=timeout,
                     interval=interval,
                     status_msg_function=describe_progress)

    def wait_cephosdremoverequest_phase(self, name, namespace, expected_request_phase='Completed',
                                        timeout=3000, interval=120):
        """Poll a CephOsdRemoveRequest until its phase equals the expected one.

        :param name: request object name
        :param namespace: request object namespace
        :param expected_request_phase: phase to wait for
        :param timeout: forwarded to ``waiters.wait``
        :param interval: forwarded to ``waiters.wait``
        """
        def describe_progress():
            # Passed to waiters.wait as status_msg_function.
            current = self.get_cephosdremoverequest_phase(name, namespace)
            return f"Current request phase is: {current}. Expected: {expected_request_phase}. "

        def phase_reached():
            current = self.get_cephosdremoverequest_phase(name=name, namespace=namespace)
            return current == expected_request_phase

        waiters.wait(phase_reached,
                     timeout=timeout,
                     interval=interval,
                     status_msg_function=describe_progress)

    def wait_cephoperationrequest_state(self, name, namespace, expected_request_phase='Completed',
                                        expected_operation_phase=None, operation_key=None,
                                        timeout=3000, interval=120):
        """Poll a Ceph operation request until both phases match.

        The pair returned by ``get_cephoperationrequest_state`` is compared
        with ``(expected_request_phase, expected_operation_phase)``.

        :param name: request object name
        :param namespace: request object namespace
        :param expected_request_phase: expected top-level request phase
        :param expected_operation_phase: expected phase of the operation
                                         selected by ``operation_key``
        :param operation_key: key of the operation inside the request status
        :param timeout: forwarded to ``waiters.wait``
        :param interval: forwarded to ``waiters.wait``
        """
        wanted = (expected_request_phase, expected_operation_phase)

        def describe_progress():
            # Passed to waiters.wait as status_msg_function.
            request_phase, operation_phase = self.get_cephoperationrequest_state(
                name, namespace, operation_key=operation_key)
            text = f"Current request phase is: {request_phase}. Expected: {expected_request_phase}. "
            if operation_key:
                text += (f"Current operation {operation_key} phase is: {operation_phase}. "
                         f"Expected: {expected_operation_phase}")
            return text

        def phases_reached():
            observed = self.get_cephoperationrequest_state(
                name=name, namespace=namespace, operation_key=operation_key)
            return observed == wanted

        waiters.wait(phases_reached,
                     timeout=timeout,
                     interval=interval,
                     status_msg_function=describe_progress)

    def get_miracephhealths_daemons_status(self):
        """Return ``fullClusterStatus.daemonsStatus`` from the miracephhealth status.

        :return: the ``daemonsStatus`` value, or None when it is absent
        """
        health_status = self.get_miracephhealths_status()
        full_status = health_status.get('fullClusterStatus', {})
        return full_status.get('daemonsStatus')

    def get_ceph_daemons_status(self):
        """Return ``fullClusterInfo.daemonsStatus`` from the kaasceph status.

        :return: the ``daemonsStatus`` value, or None when it is absent
        """
        ceph_status = self.get_kaascephstatus()
        cluster_info = ceph_status.get('fullClusterInfo', {})
        return cluster_info.get('daemonsStatus')

    def storageclass_exists(self, sc_prefix='kubernetes-'):
        """Check whether any storage class name starts with ``sc_prefix``.

        :param sc_prefix: name prefix to look for
        :return: True when at least one storage class matches
        """
        listing = self.cluster.k8sclient.api_storage.list_storage_class().to_dict()
        matching = [sc for sc in listing['items']
                    if sc['metadata']['name'].startswith(sc_prefix)]
        return bool(matching)

    def get_ceph_pvc_status(self, pvc_name, namespace):
        """Return the ``status.phase`` of a PersistentVolumeClaim.

        :param pvc_name: claim name
        :param namespace: claim namespace
        :return: the phase ('' when the claim has no status/phase), or None
                 when fetching the claim raised ``ApiException``
        """
        try:
            claim = self.cluster.k8sclient.pvolumeclaims.get(name=pvc_name, namespace=namespace)
            claim_status = claim.data.get('status') or {}
            return claim_status.get('phase', '')
        except ApiException as e:
            LOG.error(f"Couldn't get pvc status for pvc {pvc_name} in namespace {namespace}."
                      f"Reason: {e} ")
            return None

    def wait_ceph_pvc_status(self, pvc_name, namespace, expected_status='Bound', timeout=600, interval=30):
        """
        Poll a PVC until its phase equals ``expected_status``.

        Available statuses:
        'Bound', 'Lost', 'Pending'
        https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaimStatus # noqa

        Note: the "Current status" part of the timeout message is rendered
        once, before polling starts.
        """
        def current_status():
            return self.get_ceph_pvc_status(pvc_name=pvc_name, namespace=namespace)

        failure_text = (f"Timeout waiting for pvc {pvc_name} status to be {expected_status}. "
                        f"Current status: {current_status()}")

        def status_reached():
            return current_status() == expected_status

        waiters.wait(status_reached,
                     timeout=timeout, interval=interval, timeout_msg=failure_text)

    def check_ceph_pvc(self,
                       base_image_repo=None,
                       cephfs_enabled=False,
                       sc_name='kubernetes-',
                       device_class=None):
        """
        Simple test for ceph pvc in child-cluster
        Scenario:
            1. Spawn pod with attached pv on each node
            2. Write some data to file in pv
            3. Detach and delete pod
            4. Spawn new pod and attach to old pv
            5. Check data content
            6. Resize pvc
            7. Delete pod; delete pvc; Delete ns

        Args:
            base_image_repo: image repository passed to ``create_pod``;
                             defaults to
                             ``self.cluster.determine_mcp_docker_registry()``
            cephfs_enabled: when True, a single ReadWriteMany claim is shared
                            by all pods; otherwise one ReadWriteOnce claim is
                            created per worker machine
            sc_name: name prefix used to select the storage class
            device_class: device class used for the pool lookup; when not
                          given it is derived from the storage class
                          ``pool`` parameter

        Raises:
            AssertionError: when the data read back from a volume differs
                            from the data written to it
        """
        base_image_repo = base_image_repo or self.cluster.determine_mcp_docker_registry()
        mount_path = '/data/test'
        pvc_size = 1
        ns = f'test-ns-{utils.gen_random_string(6)}'

        # Only enabled worker machines take part in the test.
        machines = self.cluster.get_machines(machine_type='worker')
        disabled_machines = [m.name for m in machines if m.is_disabled()]
        machines = [m for m in machines if m.name not in disabled_machines]
        LOG.info(f"Following machines are disabled and will be skipped: {disabled_machines}")

        if cephfs_enabled:
            LOG.info("Ceph filesystem: ENABLED")
        LOG.info("Wait for Ceph StorageClass created")
        waiters.wait(lambda: self.storageclass_exists(sc_name),
                     timeout=10 * 60, interval=60)
        storage_classes = \
            self.cluster.k8sclient.api_storage.list_storage_class().to_dict()[
                'items']
        # Search storage class based on name
        storage_class = next(x for x in storage_classes if
                             x['metadata']['name'].startswith(sc_name))
        storage_class_name = storage_class['metadata']['name']
        if not device_class:
            # Device class is taken as the second dash-separated token of the
            # storage class 'pool' parameter.
            storage_class_pool = storage_class['parameters']['pool']
            device_class = storage_class_pool.split('-')[1]
        LOG.info(f"Found Ceph filesystem's StorageClass {storage_class_name} "
                 f"with {device_class} device class")

        LOG.info('Check Ceph pvc')
        LOG.info(f'Found worker machines: {[o.name for o in machines]}')
        LOG.info(f'Create "{ns}" namespace')
        # NOTE(review): the namespace and the claims below are created before
        # the try/finally, so a failure while creating claims skips the
        # cleanup in the finally section.
        namespace = self.cluster.k8sclient.namespaces.create(
            name=ns,
            body={'metadata': {'name': ns}})

        pvc_names = []
        if cephfs_enabled:
            # One shared claim for all pods.
            pvc_name = f"test-pv-claim-{storage_class_name}"
            self.create_pvc(ns=ns, pvc_name=pvc_name, pvc_size=pvc_size, storage_class=storage_class_name,
                            access_mode='ReadWriteMany')
            pvc_names.append(pvc_name)
        else:
            # One claim per machine, named after the machine uid.
            for n in machines:
                pvc_name = f"test-pv-claim-{n.uid}"
                self.create_pvc(ns=ns, pvc_name=pvc_name, pvc_size=pvc_size, storage_class=storage_class_name,
                                access_mode='ReadWriteOnce')
                pvc_names.append(pvc_name)

        try:
            # Step 1: one pod per machine; the node hostname label value is
            # passed to create_pod as node_name.
            pods = []
            for n in machines:
                if cephfs_enabled:
                    pvc_name = f"test-pv-claim-{storage_class_name}"
                else:
                    pvc_name = f"test-pv-claim-{n.uid}"
                node_labels = n.get_k8s_node().read().metadata.labels
                node_name = node_labels['kubernetes.io/hostname']
                pod = self.create_pod(pvc_name=pvc_name, name=f"test-pod-{n.uid}", node_name=node_name,
                                      mount_path=mount_path, ns=ns, base_image_repo=base_image_repo)
                pods.append(pod)

            for pod in pods:
                LOG.info(f'Waiting for pod {pod.name} is Running.')
                pod.wait_phase(phases='Running', timeout=30*60, interval=30)

            # Step 2: write a random string into a randomly named file in
            # every pod and remember both, keyed by pod name.
            file_paths = {}
            expected_strings = {}
            for pod in pods:
                LOG.info(f"Create new file in attached pv of {pod.name} pod")
                file_path = f'{mount_path}/{utils.gen_random_string(4)}_test.txt'
                expected_string = utils.gen_random_string(10)
                pod.exec(
                    ['/bin/sh', '-c',
                     f'echo "{expected_string}" > {file_path}'])
                pod.check_file_exists(file_path)
                file_paths[pod.name] = file_path
                expected_strings[pod.name] = expected_string

            # Step 3: delete the pods; the claims are kept.
            for pod in pods:
                LOG.info(f"Delete {pod.name} pod")
                pod.delete()

            # Step 4: recreate pods with the same names and the same claims
            # (the same pod names are what makes the lookups in step 5 work).
            pods = []
            for n in machines:
                if cephfs_enabled:
                    pvc_name = f"test-pv-claim-{storage_class_name}"
                else:
                    pvc_name = f"test-pv-claim-{n.uid}"
                node_labels = n.get_k8s_node().read().metadata.labels
                node_name = node_labels['kubernetes.io/hostname']
                pod = self.create_pod(pvc_name=pvc_name, name=f"test-pod-{n.uid}", node_name=node_name,
                                      mount_path=mount_path, ns=ns, base_image_repo=base_image_repo)
                pods.append(pod)

            for pod in pods:
                LOG.info(f"Waiting for pod {pod.name} is Running.")
                pod.wait_phase(phases='Running', timeout=30 * 60, interval=30)

            # Step 5: the data written before the pod re-creation must still
            # be readable.
            for pod in pods:
                LOG.info('Check file content')
                file_path = file_paths[pod.name]
                expected_string = expected_strings[pod.name]
                pod.check_file_exists(file_path)
                res = pod.exec(['/bin/sh', '-c', f'cat {file_path}'])
                assert res.rstrip() == expected_string, \
                    (f"Expected file in pod {pod.name} has unexpected content: "
                     f"expected: '{expected_string}', actual: {res.rstrip()}")

            # Step 6: resize, only for the baremetal provider.
            # If baremetal and has special parameter https://mirantis.jira.com/browse/PRODX-16779
            if self.cluster.provider == utils.Provider.baremetal:
                # Pool name is taken as the first dash-separated token of the
                # storage class name.
                storage_name = storage_class_name.split('-')[0]
                if self.cluster.workaround.skip_kaascephcluster_usage():
                    ceph_crd = self.cluster.get_miracephcluster()
                    ceph_pool = next((p for p in ceph_crd.data['spec']['pools'] if
                                      p['name'] == storage_name and p['deviceClass'] == device_class), {})
                else:
                    ceph_crd = self.cluster.get_kaascephcluster()
                    ceph_pool = next((p for p in ceph_crd.data['spec']['cephClusterSpec']['pools'] if
                                      p['name'] == storage_name and p['deviceClass'] == device_class), {})

                # Pod name <-> claim name lookups. With cephfs every pod maps
                # to the same claim, so pvc_pod_map keeps only the pod of the
                # last machine for it.
                pod_pvc_map = {}
                pvc_pod_map = {}
                for n in machines:
                    pvc_name = f"test-pv-claim-{n.uid}"
                    if cephfs_enabled:
                        pvc_name = f"test-pv-claim-{storage_class_name}"
                    pod_pvc_map[f"test-pod-{n.uid}"] = pvc_name
                    pvc_pod_map[pvc_name] = f"test-pod-{n.uid}"

                if cephfs_enabled or ceph_pool.get('allowVolumeExpansion', False):
                    # Requested size in Gi -> updated claim object.
                    pvcs = {}
                    pvc_incr = pvc_size
                    if cephfs_enabled:
                        pvc_incr += 1
                        LOG.info(f'Resize pv to {pvc_incr} Gi')
                        pvc_template = templates.render_template(
                            settings.CEPH_PVC_YAML, {
                                'PVC_NAME': f"test-pv-claim-{storage_class_name}",
                                'PVC_SIZE': f'{pvc_incr}Gi',
                                'ACCESS_MODE': 'ReadWriteMany',
                                'STORAGE_CLASS_NAME': storage_class_name})
                        pvc_json_body = json.dumps(
                            yaml.load(pvc_template, Loader=yaml.SafeLoader))
                        pvc = self.cluster.k8sclient.pvolumeclaims.update(name=f"test-pv-claim-{storage_class_name}",
                                                                          namespace=ns,
                                                                          body=json.loads(
                                                                              pvc_json_body))
                        pvcs[pvc_incr] = pvc
                    else:
                        # pvc_incr is not reset between pods: every next
                        # claim is resized to 1Gi more than the previous one,
                        # which also keeps the keys of `pvcs` unique.
                        for pod in pods:
                            pvc_incr += 1
                            LOG.info(f'Resize pv to {pvc_incr} Gi')
                            pvc_template = templates.render_template(
                                settings.CEPH_PVC_YAML, {
                                    'PVC_NAME': pod_pvc_map[pod.name],
                                    'PVC_SIZE': f'{pvc_incr}Gi',
                                    'ACCESS_MODE': 'ReadWriteOnce',
                                    'STORAGE_CLASS_NAME': storage_class_name})
                            pvc_json_body = json.dumps(
                                yaml.load(pvc_template, Loader=yaml.SafeLoader))
                            pvc = self.cluster.k8sclient.pvolumeclaims.update(name=pod_pvc_map[pod.name],
                                                                              namespace=ns,
                                                                              body=json.loads(
                                                                                  pvc_json_body))
                            pvcs[pvc_incr] = pvc

                    for pod in pods:
                        LOG.info(f"Waiting for pod {pod.name} is Running.")
                        pod.wait_phase(phases='Running')

                    for pvc_incr, pvc in pvcs.items():
                        pvc_volume_name = pvc.data['spec']['volume_name']

                        LOG.info('Check new pv size')
                        # The volume object is looked up once; the waiter
                        # below polls its `data` attribute.
                        # NOTE(review): presumably `pv.data` re-reads the
                        # object on each access - confirm, otherwise the
                        # poll compares a stale capacity.
                        pv = next(x for x in self.cluster.k8sclient.pvolumes.list_all() if
                                  x.data['spec']['claim_ref'] is not None
                                  and x.name == pvc_volume_name)
                        waiters.wait(
                            lambda: pv.data['spec']['capacity']['storage'] == f'{pvc_incr}Gi',
                            timeout=30, interval=5, timeout_msg=f"Expected pvc_size = {pvc_incr}Gi in pod "
                                                                f"{pvc_pod_map[pvc.name]} but we have "
                                                                f"{pv.data['spec']['capacity']['storage']}")
                else:
                    LOG.info(f'allowVolumeExpansion parameter not found or in FALSE state.'
                             f'Skip pvc resize test. Print ceph_pool: {ceph_pool}')
            else:
                LOG.info('Skip pvc resize test for non-BM provider')

            # Step 7 (pods part): pods are deleted only on the success path.
            for pod in pods:
                LOG.info(f"Delete {pod.name} pod")
                pod.delete()

        finally:
            # Step 7 (claims and namespace): executed on success and failure.
            LOG.info('Delete pvcs')
            for pvc_name in pvc_names:
                pvc = self.cluster.k8sclient.pvolumeclaims.get(name=pvc_name, namespace=ns)
                pvc.delete()

            LOG.info('Delete namespace')
            namespace.delete()
            LOG.info('Check Ceph pvc done')

    def get_miraceph_default_pool(self):
        """
        Get default pool configuration from the MiraCeph cluster.

        Pools without a ``default`` key are treated as non-default.

        Returns: dict of default pool configuration if one is found, otherwise None.
        """
        miraceph_cluster = self.cluster.get_miracephcluster()
        if not miraceph_cluster:
            LOG.warning("Cluster doesn't have miraceph cluster!")
            return None

        # `or []` also covers an explicit `pools: null` in the spec.
        miraceph_pools = miraceph_cluster.data.get('spec', {}).get('pools') or []
        for pool in miraceph_pools:
            # .get(): the 'default' key may be absent on a pool definition.
            if pool.get('default'):
                return pool
        LOG.warning("Cannot find default miraceph pool!")
        return None

    def get_rookcephstatus(self):
        """Return the ``status`` section of the rook ceph cluster object.

        :return: the status dict, or an empty dict when it is missing/empty
        """
        rook_cluster = self.cluster.get_rookcephcluster()
        rook_status = rook_cluster.data.get('status')
        return rook_status or {}

    def get_rook_ceph_health_status(self):
        """
        Get Ceph Cluster health from rookcephcluster object

        Reads ``ceph.health`` from ``self.get_rookcephstatus()``; an empty
        dict is returned when the value is absent.
        """
        ceph_section = self.get_rookcephstatus().get('ceph', {})
        return ceph_section.get('health', {})

    def wait_rook_ceph_health_status(self, expected_status='HEALTH_OK', timeout=2400,
                                     interval=30):
        """Poll until ``get_rook_ceph_health_status()`` equals ``expected_status``.

        :param expected_status: health value to wait for
        :param timeout: forwarded to ``waiters.wait`` (reported as seconds
                        in the timeout message)
        :param interval: forwarded to ``waiters.wait``
        """
        failure_text = f"Ceph status failed to became {expected_status} in {timeout} sec"

        def health_reached():
            return self.get_rook_ceph_health_status() == expected_status

        waiters.wait(health_reached,
                     timeout=timeout, interval=interval,
                     timeout_msg=failure_text)

    def miracephfs_enabled(self):
        """
        Checks if Ceph filesystem is enabled in the MiraCeph cluster.

        Returns: the ``spec.sharedFilesystem.cephFS`` value (an empty dict
        when it is not set), or None when the cluster has no MiraCeph object.
        """
        miraceph_cluster = self.cluster.get_miracephcluster()
        if not miraceph_cluster:
            LOG.warning("Cluster doesn't have miraceph cluster")
            return None

        spec = miraceph_cluster.data.get('spec', {})
        shared_fs = spec.get('sharedFilesystem', {})
        return shared_fs.get('cephFS', {})

    def check_k8s_nodes(self, timeout=360, interval=10):
        """Wait until ``self.k8sclient.nodes.all_ready()`` returns a truthy value.

        :param timeout: forwarded to ``waiters.wait`` (reported as seconds
                        in the error message)
        :param interval: forwarded to ``waiters.wait``
        :raises exceptions.TimeoutError: when the wait timed out; the message
            lists the nodes whose reported status is not 'True'
        """
        LOG.info("Checking k8s nodes status")

        def all_nodes_ready():
            return self.k8sclient.nodes.all_ready()

        try:
            waiters.wait(all_nodes_ready, timeout=timeout, interval=interval)
        except exceptions.TimeoutError:
            not_ready = {}
            for node_name, node_status in self.k8sclient.nodes.list_statuses().items():
                if node_status != 'True':
                    not_ready[node_name] = node_status
            raise exceptions.TimeoutError(
                f"Timeout waiting for nodes to be Ready. "
                f"After {timeout} sec next nodes are not Ready: {not_ready}")
        LOG.info("All k8s nodes are Ready")

    def wait_k8s_node_status(self, nodename='', expected_status='',
                             timeout=360, interval=10):
        """Wait until ``self.k8sclient.nodes.node_status(nodename, expected_status)`` is truthy.

        :param nodename: k8s node name
        :param expected_status: status passed to ``node_status``
        :param timeout: forwarded to ``waiters.wait`` (reported as seconds
                        in the error message)
        :param interval: forwarded to ``waiters.wait``
        :raises exceptions.TimeoutError: when the wait timed out
        """
        LOG.info(f"Wait for k8s node stay in {expected_status} status.")

        def node_in_expected_status():
            return self.k8sclient.nodes.node_status(nodename, expected_status)

        try:
            waiters.wait(node_in_expected_status,
                         timeout=timeout, interval=interval,
                         timeout_msg=f"Node not in status {expected_status}")
        except exceptions.TimeoutError:
            raise exceptions.TimeoutError(
                f"Timeout waiting for node {nodename} to be "
                f"{expected_status} After {timeout} sec.")
        LOG.info(f"k8s node {nodename} is in status <{expected_status}>.")

    # note: check_machines_status will do similar thing, but we still need this
    def check_cluster_nodes(self, timeout=1800, interval=10,
                            expected_state='Ready'):
        """Wait until the cluster nodes reach ``expected_state``.

        For ``expected_state`` other than 'Ready' only
        ``self.cluster.cluster_status`` is compared (case-insensitively).
        For 'Ready' the ``status.providerStatus.nodes`` counters are used:
        nothing is considered ready while ``requested`` is 0; when there are
        disabled nodes, ``requested`` must equal ``ready + disabled``;
        otherwise the cluster status itself must match.

        :param timeout: forwarded to ``waiters.wait`` (reported as seconds
                        in the error message)
        :param interval: forwarded to ``waiters.wait``
        :param expected_state: state to wait for
        :raises exceptions.TimeoutError: when the wait timed out and node
            counters are available
        :raises Exception: when the wait timed out and the cluster has no
            status, or the status has no node counters
        """
        LOG.info("Checking cluster nodes status")

        def check_nodes_status():
            if expected_state != 'Ready':
                return self.cluster.cluster_status.lower() == expected_state.lower()
            cluster_status = self.cluster.data.get('status') or {}
            cluster_nodes = cluster_status.get('providerStatus', {}).get('nodes', {})
            requested_nodes = int(cluster_nodes.get('requested', 0))
            if not requested_nodes:
                return False
            ready_nodes = int(cluster_nodes.get('ready', 0))
            disabled_nodes = int(cluster_nodes.get('disabled', 0))
            if disabled_nodes > 0:
                LOG.info(f"Cluster has {disabled_nodes} disabled nodes, they are not counted as unready")
                return requested_nodes == ready_nodes + disabled_nodes
            return self.cluster.cluster_status.lower() == expected_state.lower()

        try:
            waiters.wait(check_nodes_status,
                         timeout=timeout, interval=interval)
            LOG.info(f"All cluster nodes are {expected_state}")
        except exceptions.TimeoutError:
            # .get(): a cluster without a 'status' key must end up in the
            # "Pending" error below instead of failing with KeyError.
            status = self.cluster.data.get('status')
            if not status:
                raise Exception("Cluster is in Pending status")
            nodes = status.get('providerStatus', {}).get('nodes', {})
            if not nodes:
                raise Exception(f"Nodes for cluster are not found {status}")
            err = f"Timeout waiting for cluster " \
                  f"nodes to be {expected_state} " \
                  f"after {timeout} sec: {nodes}"
            raise exceptions.TimeoutError(err)

    def check_pods_number(self, pod_name_prefix, ns, pods):
        """Wait till replica number restores

        Polls (timeout=360, interval=10) until the number of Running pods in
        ``ns`` whose name contains ``pod_name_prefix`` equals ``len(pods)``.

        :param pod_name_prefix: substring looked up in the pod names
        :param ns: namespace to list pods in
        :param pods: collection of pods that existed before; only its length
                     is used
        :raises TimeoutError: (builtin) when the number was not restored
        """
        def replica_count_restored(previous_pods):
            # number of pods we had before
            expected_count = len(previous_pods)
            running_now = self.k8sclient.pods.list(
                ns, field_selector="status.phase=Running"
            )
            # number of running pods carrying the prefix in the name
            matching_now = [p for p in running_now if pod_name_prefix in p.name]
            return expected_count == len(matching_now)

        try:
            waiters.wait(
                lambda: replica_count_restored(pods),
                timeout=360, interval=10)
        except exceptions.TimeoutError:
            running_with_prefix = self.k8sclient.pods.list_starts_with(
                pod_name_prefix, ns, field_selector="status.phase=Running")
            actual_num = len(running_with_prefix)
            LOG.error(f'Timeout waiting for correct number. '
                      f'Actual number: {actual_num}')
            raise TimeoutError

    def _get_replicasets_by_namespace(self, target_namespaces):
        """Group ReplicaSet objects of the cluster by namespace.

        :param target_namespaces: namespace (str) or collection of
                                  namespaces to keep; a falsy value keeps all
        :return: dict ``{namespace: {"ReplicaSet/<uid>": replicaset}}``
        """
        if isinstance(target_namespaces, str):
            wanted_namespaces = [target_namespaces]
        else:
            wanted_namespaces = target_namespaces

        grouped = {}
        for replica_set in self.k8sclient.replicasets.list_all():
            if wanted_namespaces and replica_set.namespace not in wanted_namespaces:
                continue
            # ReplicaSet objects in list have 'kind: None' until rs.read()
            replica_set.kind = 'ReplicaSet'
            # spec in _read_cache from ReplicaSetList contains required values
            replica_set.spec = replica_set._read_cache.spec
            if replica_set.namespace not in grouped:
                grouped[replica_set.namespace] = {}
            grouped[replica_set.namespace][f"{replica_set.kind}/{replica_set.uid}"] = replica_set
        return grouped

    def check_k8s_pods(self, phases=('Running', 'Succeeded'),
                       target_namespaces=None,
                       timeout=settings.WAIT_PODS_READY_TIMEOUT,
                       interval=30, pods_prefix=''):
        """Wait till all expected pods for cluster are in specified
           phase and have Ready=True for all containers

        Pods are checked twice with a pause in between; the check passes only
        when no pod/container disappeared and no container restart counter
        grew during the pause. After the pods, the jobs are waited for with
        the same timeout and interval.

        Args:
            phases: list of expected pod phases
            target_namespaces: namespace (str) or namespaces (list)
                               where pods should be checked; when not set,
                               the keys of self.cluster.expected_pods are used
            timeout: timeout to wait
            interval: time between checks
            pods_prefix: passed to the pods check as pods_prefix and to the
                         jobs check as jobs_prefix

        Raises:
            TimeoutError: (builtin) when pods or jobs did not reach the
                          expected state in time
            ApiException: when the pods check failed with a status other
                          than 404
        """

        def wait_for_running_pods(pause=60):
            """Get the pods statuses and compare the restart counts

            :param pause: int, seconds, pause between restart checks
            """
            LOG.info("Check k8s pods status")
            rs_info = self._get_replicasets_by_namespace(target_namespaces)
            # 404 appeared when a just created Management cluster don't have /clusters api-resource yet
            # or sometimes resource can disappear at checking procedure, and we will receive 404
            try:
                pods_info = self.k8sclient.pods.check_pods_statuses(
                    target_namespaces=target_namespaces, phases=phases,
                    excluded_pods=self.EXCLUDED_PODS,
                    pods_prefix=pods_prefix, replicasets=rs_info)
                if not pods_info:
                    return False
            except ApiException as ex:
                if ex.status != 404:
                    raise
                else:
                    LOG.error(f"Error happened while checking pods statuses: {ex}")
                    return False

            LOG.info(f"Wait for {pause} seconds before the second check")
            time.sleep(pause)

            LOG.info("Check k8s pods status one more time")
            rs_info = self._get_replicasets_by_namespace(target_namespaces)
            try:
                pods_info_new = self.k8sclient.pods.check_pods_statuses(
                    target_namespaces=target_namespaces, phases=phases,
                    excluded_pods=self.EXCLUDED_PODS,
                    pods_prefix=pods_prefix, replicasets=rs_info)
                if not pods_info_new:
                    return False
            except ApiException as ex:
                if ex.status != 404:
                    raise
                else:
                    LOG.error(f"Error happened while checking pods statuses: {ex}")
                    return False

            # Compare the two snapshots; every problem is logged, so the loop
            # does not stop on the first mismatch.
            result = True
            for pod, info in pods_info.items():
                if pod not in pods_info_new:
                    LOG.warning(f"Pod {pod} disappeared after {pause} seconds")
                    result = False
                    continue
                cont_restarts_new = pods_info_new[pod]['containers_restarts']
                for container, restarts in info['containers_restarts'].items():
                    if container not in cont_restarts_new:
                        LOG.warning(f"Container {container} from pod {pod} "
                                    f"disappeared after {pause} seconds")
                        result = False
                        continue
                    restarts_new = cont_restarts_new[container]
                    if restarts_new > restarts:
                        LOG.warning(f"Container {container} from pod {pod} "
                                    f"has been restarted after {pause} seconds"
                                    f", current restarts: {restarts_new}")
                        result = False

            if result:
                LOG.info("All pods are in correct state")
            return result

        # wait_for_running_pods reads target_namespaces from this scope, so
        # the default has to be resolved before the waiters are started.
        if not target_namespaces:
            target_namespaces = self.cluster.expected_pods.keys()
        LOG.info(f"Checking k8s pods phase and containers status in namespaces {target_namespaces}")
        try:
            # Wait for pods status
            waiters.wait(wait_for_running_pods,
                         timeout=timeout, interval=interval)

            # Wait for jobs
            waiters.wait(
                lambda: self.k8sclient.jobs.check_jobs_completed(
                    target_namespaces=target_namespaces,
                    excluded_jobs=self.EXCLUDED_JOBS,
                    jobs_prefix=pods_prefix),
                timeout=timeout, interval=interval)
        except exceptions.TimeoutError:
            # Build a report of the pods that are still not in shape.
            pods = self.k8sclient.pods.list_raw().to_dict()['items']
            if target_namespaces:
                if isinstance(target_namespaces, str):
                    target_namespaces = [target_namespaces]
                pods = [pod for pod in pods
                        if pod['metadata']['namespace'] in target_namespaces]
            utils.print_pods_status(pods)
            failed_pods = {}
            ready_text = " and Ready=True for " \
                         "each container"
            for pod, info in \
                    self.k8sclient.pods.get_pods_statuses(pods).items():
                # Check for the expected pod phase
                if info['phase'] not in phases:
                    failed_pods[pod] = info
                # Check for containers status inside the non-completed pod.
                # Equality (not `in 'Succeeded'`): a substring test raises
                # TypeError for a None phase and matches an empty phase.
                if (False in info['containers'].values()
                        and info['phase'] != 'Succeeded'):
                    failed_pods[pod] = info
            err = f"Timeout waiting for pods statuses. " \
                  f"After {timeout} sec next pods are not in {phases} phase" \
                  f"{ready_text}: {failed_pods}"

            raise TimeoutError(err)
        LOG.info("All pods and jobs are in correct state")

    def check_ha_kill_proc_pods(self, pod_group, command):
        """Run a kill command in every container of every pod in a group

        Precondition - all expected pods and their replicas must be presented
        The following scenario is executed for every namespace and
        expected pod entry in get_expected_pods
        Note: 'kill -- - 1' will kill all processes except pid 1

        Scenario:
            1. Iterate by each pod in a group
            2. Iterate by each container in a pod and
               execute the command in the container
            3. Wait till number of replicas will be restored
            4. Check pods and container statuses in this group
               (Running and Ready)
            5. Check all pods phases (must be Running or Completed)
               (in fixture)

        Expected result - pods are recreated, number of replicas is restored.

        :param pod_group: tuple of (pods name prefix, pods number, pods);
            the second item is not used here (presumably the expected
            replica count - verify against the caller)
        :param command: shell command executed with '/bin/sh -c'
            in every container
        :raises AssertionError: if the group has no pods, or a pod has
            missing or not ready container statuses before the kill
        """
        pod_name, _pod_num, pods = pod_group
        # Fail with a clear message instead of an IndexError on an empty group
        assert pods, f"No pods found in the group {pod_name}"

        ns = pods[0].namespace
        LOG.info("Iterate by each pod in a group")
        for pod in pods:
            # let's reread every pod to make sure it is healthy
            # before killing containers
            pod_data = pod.read()
            # container_statuses is None until statuses are reported
            statuses = pod_data.status.container_statuses or []
            assert statuses and all(cs.ready for cs in statuses), \
                f"Some containers are not ready for {pod.name} " \
                f"pod: {pod_data.status.container_statuses}"
            LOG.info(f"Killing services in "
                     f"all containers of the pod: {pod.name}")
            for cont in statuses:
                LOG.info(f"Execute command {command} "
                         f"in {cont.name} container. "
                         f"Restarts: {cont.restart_count}")
                try:
                    result = pod.exec(['/bin/sh', '-c', command],
                                      container=cont.name)
                except (ApiException, WebSocketBadStatusException) as e:
                    # Best effort: presumably the exec stream can break
                    # when the command kills the container processes
                    LOG.error(e)
                else:
                    if result:
                        LOG.info(f"Killed with result: {result}")

        LOG.info("Check number of pods")
        self.check_pods_number(pod_name, ns, pods)
        LOG.info(f"Check status for {pod_name} pods")
        # PRODX-12786: kibana pods get an extra pause and a longer timeout
        if 'kibana' in pod_name:
            time.sleep(30)
            timeout = 600
        else:
            timeout = 300
        self.check_k8s_pods(pods_prefix=pod_name,
                            timeout=timeout,
                            interval=30)

    def get_leader(self, app_prefix):
        """Return the pod name stored in the leader-election Lease of an app

        The first Lease whose name starts with '<app_prefix>-leader-election'
        is used. The pod name is the part of its spec.holderIdentity before
        the first underscore. An empty holderIdentity yields an empty string.

        :param app_prefix: application name prefix used to build
            the Lease name prefix
        :return: leader pod name
        :raises Exception: if the Lease is not found or it has
            no spec.holderIdentity
        """
        app_leader_name = app_prefix + '-leader-election'
        LOG.info("Get leader data from Lease")
        leases = self.k8sclient.leases.list_all(name_prefix=app_leader_name)
        lease = leases[0] if leases else None
        if not lease:
            raise Exception(f"Cannot find Lease for {app_prefix}")

        # 'spec' may be absent or None, so do not chain .get() on it blindly
        spec = lease.data.get('spec') or {}
        holder_identity = spec.get('holderIdentity')
        if holder_identity is None:
            # Previously this ended up as AttributeError on None.split()
            raise Exception(
                f"Lease for {app_prefix} has no holderIdentity")
        leader_pod_name = holder_identity.split("_")[0]

        LOG.info(f"Current leader for app {app_prefix} is {leader_pod_name}")
        return leader_pod_name

    def check_ha_delete_leader_pod(self, app_group=None):
        """Detect and delete leader pod for selected app. Check that leader changed

        Precondition - all expected pods and their replicas must be presented
        The following scenario is executed for selected application

        Scenario:
            1. Detect leader
            2. Delete leader pod
            3. Wait till leader was changed

        Expected result - a different, non-empty leader pod name is reported
        by get_leader within 120 seconds after the deletion.

        :param app_group: pods name prefix of the application, also used
            as the prefix for the leader-election Lease lookup
        :raises AssertionError: if app_group is empty, no pods are found,
            or the current leader pod is not among the found pods
        """
        # Without a prefix the Lease name cannot be built in get_leader
        assert app_group, "app_group must be set to detect the leader pod"
        pods = self.cluster.k8sclient.pods.list_all(name_prefix=app_group)
        assert len(pods) > 0, "Pods not found for selected app"
        # get_leader already returns the part before the first underscore
        leader_pod_name_before = self.get_leader(app_prefix=app_group)

        LOG.info("Get leader pod and delete it")
        leader_pods = [pod for pod in pods
                       if pod.name == leader_pod_name_before]
        # Fail fast: otherwise nothing is deleted and the wait below
        # would just run into its timeout
        assert leader_pods, \
            f"Leader pod {leader_pod_name_before} not found among pods " \
            f"with prefix {app_group}"
        for pod in leader_pods:
            LOG.info(f"Deleting leader pod: {pod.name}")
            pod.delete(timeout=180)

        def leader_changed():
            leader_pod_name_after = self.get_leader(app_prefix=app_group)
            # An empty name means no holder is recorded yet, which is
            # not a new leader
            if (leader_pod_name_after
                    and leader_pod_name_after != leader_pod_name_before):
                LOG.info(f"Leader changed to {leader_pod_name_after}")
                return True
            LOG.info(f"Leader is not changed yet. "
                     f"Should be not {leader_pod_name_before}")
            return False

        waiters.wait(leader_changed, timeout=120, interval=5)

    def check_ha_delete_pods(self, pod_group):
        """Delete the pods of a group one by one and verify recovery

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Take the namespace of the first pod in the group
            2. Iterate by each pod of the group
            3. Delete the pod (waiting for readiness of the same pod object
               when its name contains 'mariadb-server')
            4. Check the number of pods in the group
            5. Check pods statuses in this group (timeout 300 sec)

        Expected result - pods are recreated, number of replicas is restored.

        :param pod_group: tuple of (pods name prefix, pods number, pods);
            the second item is not used here
        """
        group_prefix, _unused_num, group_pods = pod_group
        namespaces = [member.namespace for member in group_pods]
        group_ns = namespaces[0]

        LOG.info("Deleting pods")
        for victim in group_pods:
            LOG.info(f"Deleting pod: {victim.name}")
            victim.delete(timeout=180)

            is_mariadb = 'mariadb-server' in victim.name
            if is_mariadb:
                # special handling of mariadb-server StatefulSets
                LOG.info(f"Checking readiness for {victim.name} (StatefulSet)")
                victim.wait_ready()

            # Verify the group after every single deletion
            LOG.info("Check number of pods")
            self.check_pods_number(group_prefix, group_ns, group_pods)
            LOG.info(f"Check status for {group_prefix} pods")
            self.check_k8s_pods(
                pods_prefix=group_prefix, timeout=300, interval=30)

    def check_ha_kill_mariadb_cluster(self):
        """Kill mysqld in mariadb-server containers, one node at a time

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled openstack-control-plane=enabled
            2. On every machine list docker containers matching
               '^k8s_mariadb_mariadb-server-.?_openstack'
            3. Kill the mysqld process of every found container
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)
            5. Sleep 200 sec after each machine to let mariadb complete SST

        Expected result - all mysqld services restored successfully.
        """
        if 'mosk' not in self.cluster.clusterrelease_version:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill mariadb were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill mariadb were skipped")

        list_containers = ("docker ps --format '{{.Names}}' | "
                           "grep -E '^k8s_mariadb_mariadb-server-.?_openstack'")
        control_plane = self.cluster.get_machines(
            k8s_labels={"openstack-control-plane": "enabled"})
        for machine in control_plane:
            LOG.info(f"Accessing {machine.name}")
            found = machine._run_cmd(
                list_containers,
                ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
            for container in found.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='mysqld')
                LOG.info("Waiting for pods to be in a correct state")
                self.check_k8s_pods(
                    timeout=1200, interval=30, target_namespaces="openstack")
            LOG.info("Wait couple minutes for mariadb to complete SST")
            time.sleep(200)

    def check_ha_kill_memcached_cluster(self):
        """Kill memcached in its containers, one node at a time

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled openstack-control-plane=enabled
            2. On every machine list docker containers matching
               '^k8s_memcached_openstack-memcached-memcached-.?_openstack'
            3. Kill the memcached process of every found container
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)

        Expected result - all memcached services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        if 'mosk' not in release:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill memcached were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill memcached were skipped")

        ssh_key = settings.HA_TEST_PRIVATE_KEY_FILE
        for machine in self.cluster.get_machines(
                k8s_labels={"openstack-control-plane": "enabled"}):
            LOG.info(f"Accessing {machine.name}")
            listing = machine._run_cmd(
                "docker ps --format '{{.Names}}' | "
                "grep -E '^k8s_memcached_openstack-memcached-memcached-.?_openstack'",
                ssh_key=ssh_key).stdout_str
            for container in listing.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='memcached')
                LOG.info("Waiting for pods to be in a correct state")
                self.check_k8s_pods(
                    timeout=1200, interval=30, target_namespaces='openstack')

    def check_ha_kill_etcd_cluster(self):
        """Kill etcd in its containers, one node at a time

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled openstack-control-plane=enabled
            2. On every machine list docker containers matching
               '^k8s_etcd_etcd-etcd-.?_openstack'
            3. Kill the etcd process of every found container
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)

        Expected result - all etcd services restored successfully.
        """
        if 'mosk' not in self.cluster.clusterrelease_version:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill etcd were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill etcd were skipped")

        find_etcd = ("docker ps --format '{{.Names}}' | "
                     "grep -E '^k8s_etcd_etcd-etcd-.?_openstack'")
        machines = self.cluster.get_machines(
            k8s_labels={"openstack-control-plane": "enabled"})
        for machine in machines:
            LOG.info(f"Accessing {machine.name}")
            output = machine._run_cmd(
                find_etcd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
            etcd_containers = output.splitlines()
            for container in etcd_containers:
                kill_process_in_container_by_word(
                    machine, container, grep_word='etcd')
                LOG.info("Waiting for pods to be in a correct state")
                self.check_k8s_pods(
                    timeout=1200, interval=30, target_namespaces="openstack")

    def check_ha_kill_rabbitmq(self):
        """Kill the rabbitmq process of the openstack-rabbitmq-rabbitmq-0 pod

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Find the machine hosting pod 'openstack-rabbitmq-rabbitmq-0'
               from the 'openstack' namespace
            2. List docker containers of that pod on the machine
            3. Kill the beam.smp process of every found container
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)

        Expected result - rabbitmq services restored successfully.
        """
        if 'mosk' not in self.cluster.clusterrelease_version:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill rabbitmq were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill rabbitmq were skipped")

        rabbit_pod = self.cluster.k8sclient.pods.get(
            "openstack-rabbitmq-rabbitmq-0", namespace="openstack")
        host = self.cluster.get_machine_by_k8s_name(rabbit_pod.node_name)
        listing = host._run_cmd(
            "docker ps --format '{{.Names}}' | "
            "grep -E '^k8s_rabbitmq_openstack-rabbitmq-rabbitmq-0_openstack'",
            ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
        for container in listing.splitlines():
            kill_process_in_container_by_word(
                host, container, grep_word='beam.smp')
            LOG.info("Waiting for rabbitmq to be in a correct state")
            self.check_k8s_pods(
                timeout=1200, interval=30, target_namespaces="openstack")

    def check_ha_kill_neutron_rabbitmq(self):
        """Kill the rabbitmq process of the neutron rabbitmq pod

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Find the machine hosting pod
               'openstack-neutron-rabbitmq-rabbitmq-0' from the
               'openstack' namespace
            2. List docker containers of that pod on the machine
            3. Kill the beam.smp process of every found container
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)

        Expected result - neutron rabbitmq services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        if 'mosk' not in release:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill neutron rabbtimq were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill neutron rabbtimq were skipped")

        target_pod = "openstack-neutron-rabbitmq-rabbitmq-0"
        pod_obj = self.cluster.k8sclient.pods.get(
            target_pod, namespace="openstack")
        machine = self.cluster.get_machine_by_k8s_name(pod_obj.node_name)
        find_cmd = (
            "docker ps --format '{{.Names}}' | "
            "grep -E '^k8s_rabbitmq_openstack-neutron-rabbitmq-rabbitmq-0_openstack'")
        names = machine._run_cmd(
            find_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
        for container in names.splitlines():
            kill_process_in_container_by_word(
                machine, container, grep_word='beam.smp')
            LOG.info("Waiting for rabbitmq to be in a correct state")
            self.check_k8s_pods(
                timeout=1200, interval=30, target_namespaces="openstack")

    def check_ha_kill_neutron_ovs_agent(self):
        """Kill neutron-openvswitch-agent in its containers, node by node

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled openvswitch=enabled
            2. On every machine list docker containers matching
               '^k8s_neutron-ovs-agent_neutron-ovs-agent'
            3. Kill the neutron-openvswitch-agent process of every
               found container
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)

        Expected result - neutron ovs agent services restored successfully.
        """
        if 'mosk' not in self.cluster.clusterrelease_version:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill neutron-openvswitch-agent were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill neutron-openvswitch-agent were skipped")

        agent_word = 'neutron-openvswitch-agent'
        list_cmd = ("docker ps --format '{{.Names}}' | "
                    "grep -E '^k8s_neutron-ovs-agent_neutron-ovs-agent'")
        ovs_machines = self.cluster.get_machines(
            k8s_labels={"openvswitch": "enabled"})
        for machine in ovs_machines:
            LOG.info(f"Accessing {machine.name}")
            listing = machine._run_cmd(
                list_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
            for container in listing.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word=agent_word)
                LOG.info("Waiting for pods to be in a correct state")
                self.check_k8s_pods(
                    timeout=1200, interval=30, target_namespaces="openstack")

    def check_ha_kill_neutron_dhcp_agent(self):
        """Kill neutron-dhcp-agent in its containers, node by node

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled openstack-gateway=enabled
            2. On every machine list docker containers matching
               '^k8s_neutron-dhcp-agent_neutron-dhcp-agent'
            3. Kill the neutron-dhcp-agent process of every found container
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)

        Expected result - neutron-dhcp-agent services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        if 'mosk' not in release:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill neutron-dhcp-agent were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill neutron-dhcp-agent were skipped")

        gateways = self.cluster.get_machines(
            k8s_labels={"openstack-gateway": "enabled"})
        for gateway in gateways:
            LOG.info(f"Accessing {gateway.name}")
            cmd_result = gateway._run_cmd(
                "docker ps --format '{{.Names}}' | "
                "grep -E '^k8s_neutron-dhcp-agent_neutron-dhcp-agent'",
                ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            dhcp_containers = cmd_result.stdout_str.splitlines()
            for container in dhcp_containers:
                kill_process_in_container_by_word(
                    gateway, container, grep_word='neutron-dhcp-agent')
                LOG.info("Waiting for pods to be in a correct state")
                self.check_k8s_pods(
                    timeout=1200, interval=30, target_namespaces="openstack")

    def check_ha_kill_neutron_l3_agent(self):
        """Kill neutron-l3-agent in its containers, node by node

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled openstack-gateway=enabled
            2. On every machine list docker containers matching
               '^k8s_neutron-l3-agent_neutron-l3-agent'
            3. Kill the neutron-l3-agent process of every found container
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)

        Expected result - l3 agent services restored successfully.
        """
        if 'mosk' not in self.cluster.clusterrelease_version:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill neutron l3 agent were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill neutron l3 agent were skipped")

        key_file = settings.HA_TEST_PRIVATE_KEY_FILE
        l3_listing_cmd = ("docker ps --format '{{.Names}}' | "
                          "grep -E '^k8s_neutron-l3-agent_neutron-l3-agent'")
        for machine in self.cluster.get_machines(
                k8s_labels={"openstack-gateway": "enabled"}):
            LOG.info(f"Accessing {machine.name}")
            raw_names = machine._run_cmd(
                l3_listing_cmd, ssh_key=key_file).stdout_str
            for container in raw_names.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='neutron-l3-agent')
                LOG.info("Waiting for pods to be in a correct state")
                self.check_k8s_pods(
                    timeout=1200, interval=30, target_namespaces="openstack")

    def check_ha_kill_tf_control(self):
        """Kill contrail-control in tf-control containers, node by node

        Executed only when the cluster release version contains 'mosk'
        and TF is enabled, otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled tfcontrol=enabled
            2. On every machine list docker containers matching
               '^k8s_control_tf-control'
            3. Kill the /usr/bin/contrail-control process of every
               found container
            4. Wait for pods in the 'tf' namespace, then in the
               'openstack' namespace (up to 1200 sec each)

        Expected result - contrail-control services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        if 'mosk' not in release or not self.cluster.tf_enabled():
            # pytest.skip raises, nothing below runs in this case
            LOG.info("\nHA tests to kill tf control were skipped "
                     "as child cluster isn't MOSK with TF\n")
            pytest.skip("HA tests to kill tf control were skipped")

        list_cmd = ("docker ps --format '{{.Names}}' | "
                    "grep -E '^k8s_control_tf-control'")
        control_machines = self.cluster.get_machines(
            k8s_labels={"tfcontrol": "enabled"})
        for machine in control_machines:
            LOG.info(f"Accessing {machine.name}")
            listing = machine._run_cmd(
                list_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
            for container in listing.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='/usr/bin/contrail-control')
                LOG.info("Waiting for pods to be in a correct state")
                for namespace in ("tf", "openstack"):
                    self.check_k8s_pods(
                        timeout=1200, interval=30, target_namespaces=namespace)

    def check_ha_kill_tf_kafka(self):
        """Kill the kafka broker in tf-kafka containers, node by node

        Executed only when the cluster release version contains 'mosk',
        TF is enabled and TF analytics is enabled, otherwise the test
        is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled tfanalyticsdb=enabled
            2. On every machine list docker containers matching
               '^k8s_kafka-broker_tf-kafka'
            3. Kill the process matching '/etc/kafka/kafka.properties'
               in every found container
            4. Wait for pods in the 'tf' namespace, then in the
               'openstack' namespace (up to 1200 sec each)

        Expected result - tf kafka services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        if ('mosk' not in release
                or not self.cluster.tf_enabled()
                or not self.cluster.tf_analytics_enabled()):
            # pytest.skip raises, nothing below runs in this case
            LOG.info("\nHA tests to kill tf kafka were skipped "
                     "as child cluster isn't MOSK with TF or TF Analytics is disabled\n")
            pytest.skip("HA tests to kill tf kafka were skipped ")

        ssh_key = settings.HA_TEST_PRIVATE_KEY_FILE
        analytics_db = self.cluster.get_machines(
            k8s_labels={"tfanalyticsdb": "enabled"})
        for machine in analytics_db:
            LOG.info(f"Accessing {machine.name}")
            brokers = machine._run_cmd(
                "docker ps --format '{{.Names}}' | "
                "grep -E '^k8s_kafka-broker_tf-kafka'",
                ssh_key=ssh_key).stdout_str
            for container in brokers.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='/etc/kafka/kafka.properties')
                LOG.info("Waiting for pods to be in a correct state")
                for namespace in ("tf", "openstack"):
                    self.check_k8s_pods(
                        timeout=1200, interval=30, target_namespaces=namespace)

    def check_ha_kill_tf_zookeeper(self):
        """Kill zookeeper in tf-zookeeper containers, node by node

        Executed only when the cluster release version contains 'mosk'
        and TF is enabled, otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled tfconfigdb=enabled
            2. On every machine list docker containers matching
               '^k8s_zookeeper_tf-zookeeper-.?_tf'
            3. Kill the process matching '/data/conf/zoo.cfg'
               in every found container
            4. Wait for pods in the 'tf' namespace, then in the
               'openstack' namespace (up to 1200 sec each)

        Expected result - tf zookeeper services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        tf_mosk = 'mosk' in release and self.cluster.tf_enabled()
        if not tf_mosk:
            # pytest.skip raises, nothing below runs in this case
            LOG.info("\nHA tests to kill tf zookeeper were skipped "
                     "as child cluster isn't MOSK with TF\n")
            pytest.skip("HA tests to kill tf zookeeper were skipped ")

        zk_listing_cmd = ("docker ps --format '{{.Names}}' | "
                          "grep -E '^k8s_zookeeper_tf-zookeeper-.?_tf'")
        for machine in self.cluster.get_machines(
                k8s_labels={"tfconfigdb": "enabled"}):
            LOG.info(f"Accessing {machine.name}")
            listing = machine._run_cmd(
                zk_listing_cmd,
                ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
            for container in listing.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='/data/conf/zoo.cfg')
                LOG.info("Waiting for pods to be in a correct state")
                for namespace in ("tf", "openstack"):
                    self.check_k8s_pods(
                        timeout=1200, interval=30, target_namespaces=namespace)

    def check_ha_kill_tf_zookeeper_nal(self):
        """Kill zookeeper in tf-zookeeper-nal containers, node by node

        Executed only when the cluster release version contains 'mosk',
        TF is enabled and TF analytics is enabled, otherwise the test
        is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled tfanalyticsdb=enabled
            2. On every machine list docker containers matching
               '^k8s_zookeeper_tf-zookeeper-nal-.?_tf'
            3. Kill the process matching '/data/conf/zoo.cfg'
               in every found container
            4. Wait for pods in the 'tf' namespace, then in the
               'openstack' namespace (up to 1200 sec each)

        Expected result - tf zookeeper-nal services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        if ('mosk' not in release
                or not self.cluster.tf_enabled()
                or not self.cluster.tf_analytics_enabled()):
            # pytest.skip raises, nothing below runs in this case
            LOG.info("\nHA tests to kill tf zookeeper-nal were skipped "
                     "as child cluster isn't MOSK with TF or TF Analytics is disabled\n")
            pytest.skip("HA tests to kill tf zookeeper-nal were skipped ")

        list_cmd = ("docker ps --format '{{.Names}}' | "
                    "grep -E '^k8s_zookeeper_tf-zookeeper-nal-.?_tf'")
        machines = self.cluster.get_machines(
            k8s_labels={"tfanalyticsdb": "enabled"})
        for machine in machines:
            LOG.info(f"Accessing {machine.name}")
            output = machine._run_cmd(
                list_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
            for container in output.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='/data/conf/zoo.cfg')
                LOG.info("Waiting for pods to be in a correct state")
                for namespace in ("tf", "openstack"):
                    self.check_k8s_pods(
                        timeout=1200, interval=30, target_namespaces=namespace)

    def check_ha_kill_tf_rabbitmq(self):
        """Kill rabbitmq in tf-rabbitmq containers, node by node

        Executed only when the cluster release version contains 'mosk'
        and TF is enabled, otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled tfconfigdb=enabled
            2. On every machine list docker containers matching
               '^k8s_rabbitmq_tf-rabbitmq-.?_tf'
            3. Kill the beam.smp process of every found container
            4. Wait for pods in the 'tf' namespace, then in the
               'openstack' namespace (up to 1200 sec each)

        Expected result - tf rabbitmq services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        if 'mosk' not in release or not self.cluster.tf_enabled():
            # pytest.skip raises, nothing below runs in this case
            LOG.info("\nHA tests to kill tf rabbitmq were skipped "
                     "as child cluster isn't MOSK with TF\n")
            pytest.skip("HA tests to kill tf rabbitmq were skipped")

        key_file = settings.HA_TEST_PRIVATE_KEY_FILE
        config_db = self.cluster.get_machines(
            k8s_labels={"tfconfigdb": "enabled"})
        for machine in config_db:
            LOG.info(f"Accessing {machine.name}")
            rabbit_names = machine._run_cmd(
                "docker ps --format '{{.Names}}' | "
                "grep -E '^k8s_rabbitmq_tf-rabbitmq-.?_tf'",
                ssh_key=key_file).stdout_str
            for container in rabbit_names.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='beam.smp')
                LOG.info("Waiting for pods to be in a correct state")
                for namespace in ("tf", "openstack"):
                    self.check_k8s_pods(
                        timeout=1200, interval=30, target_namespaces=namespace)

    def check_ha_kill_tf_redis(self):
        """Kill redis-server in tf-redis containers, node by node

        Executed only when the cluster release version contains 'mosk'
        and TF is enabled, otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled tfanalyticsdb=enabled
            2. On every machine list docker containers matching
               '^k8s_redis_redis-tf-redis-.?_tf'
            3. Kill the redis-server process of every found container
            4. Wait for pods in the 'tf' namespace, then in the
               'openstack' namespace (up to 1200 sec each)

        Expected result - tf redis services restored successfully.
        """
        release = self.cluster.clusterrelease_version
        # NOTE(review): tf_analytics_enabled() is not checked here although
        # machines are selected by the 'tfanalyticsdb' label - confirm
        # this is intended
        if 'mosk' not in release or not self.cluster.tf_enabled():
            # pytest.skip raises, nothing below runs in this case
            LOG.info("\nHA tests to kill tf redis were skipped "
                     "as child cluster isn't MOSK with TF\n")
            pytest.skip("HA tests to kill tf redis were skipped")

        redis_listing_cmd = ("docker ps --format '{{.Names}}' | "
                             "grep -E '^k8s_redis_redis-tf-redis-.?_tf'")
        for machine in self.cluster.get_machines(
                k8s_labels={"tfanalyticsdb": "enabled"}):
            LOG.info(f"Accessing {machine.name}")
            listing = machine._run_cmd(
                redis_listing_cmd,
                ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
            for container in listing.splitlines():
                kill_process_in_container_by_word(
                    machine, container, grep_word='redis-server')
                LOG.info("Waiting for pods to be in a correct state")
                for namespace in ("tf", "openstack"):
                    self.check_k8s_pods(
                        timeout=1200, interval=30, target_namespaces=namespace)

    def check_ha_kill_libvirt(self):
        """Kill libvirtd of libvirt containers, node by node

        Executed only when the cluster release version contains 'mosk',
        otherwise the test is skipped.

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. Get machines labeled openstack-compute-node=enabled
            2. On every machine list docker containers matching
               '^k8s_libvirt_libvirt-libvirt'
            3. Read /var/run/libvirtd.pid inside every found container
               and send 'kill -9' to that pid from the machine
            4. Wait for pods in the 'openstack' namespace (up to 1200 sec)

        Expected result - libvirt services restored successfully.
        """
        if 'mosk' not in self.cluster.clusterrelease_version:
            # pytest.skip raises, nothing below runs for non-MOSK clusters
            LOG.info("\nHA tests to kill libvirt were skipped "
                     "as child cluster isn't MOSK\n")
            pytest.skip("HA tests to kill libvirt were skipped")

        key_file = settings.HA_TEST_PRIVATE_KEY_FILE
        computes = self.cluster.get_machines(
            k8s_labels={"openstack-compute-node": "enabled"})
        for compute in computes:
            LOG.info(f"Accessing {compute.name}")
            listing = compute._run_cmd(
                "docker ps --format '{{.Names}}' | "
                "grep -E '^k8s_libvirt_libvirt-libvirt'",
                ssh_key=key_file).stdout_str
            for container in listing.splitlines():
                # The pid is taken from the pidfile inside the container,
                # the kill itself is issued on the machine
                pid = compute._run_cmd(
                    f"docker exec {container} cat /var/run/libvirtd.pid",
                    ssh_key=key_file).stdout_str
                compute._run_cmd(f"sudo kill -9 {pid}", ssh_key=key_file)
                LOG.info("Waiting for pods to be in a correct state")
                self.check_k8s_pods(
                    timeout=1200, interval=30, target_namespaces="openstack")

    def check_ha_kill_system_proc_in_cont(self):
        """Kill the non-k8s docker containers on every node one by one

        Precondition - all expected pods and their replicas must be presented
        The following scenario is executed for every node

        Scenario:
            1. SSH to each cluster node 1 by 1
            2. Collect the names of containers that do not match "k8s"
            3. Send signal 9 to the containers 1 by 1 (skip-listed ones
               are left alone)
            4. After each kill wait until exactly one matching container
               is listed by "docker ps" again
            5. Check that all pods are in a correct state

        Expected result - pods and their containers are Running and Ready.
        """
        # see PRODX-15408 for details
        skipped_containers = ['ucp-kubectl',
                              'ucp-cluster-agent']
        key_file = settings.HA_TEST_PRIVATE_KEY_FILE
        for machine in self.cluster.get_machines():
            LOG.info(f"Access {machine.name}")
            names_output = machine._run_cmd(
                "docker ps --format '{{.Names}}' | grep -v k8s",
                ssh_key=key_file).stdout_str
            container_names = names_output.split("\n")
            LOG.debug(f"Containers: {container_names}")

            LOG.debug("The state of the containers before the start of the test")
            state_before = machine._run_cmd("docker ps | grep -v k8s",
                                            ssh_key=key_file).stdout_str
            LOG.debug(f"{state_before}")

            LOG.info("Sending kill -9 for each container")
            for full_name in container_names:
                # Only the part of the name before the first dot is used
                # for matching
                short_name = full_name.split(".")[0]
                if short_name in skipped_containers:
                    LOG.info(f"Container {short_name} skipped because "
                             f"it is in a skip list")
                    continue

                kill_cmd = (f"id=`docker ps | grep {short_name} |"
                            f"awk {{'print $1'}}`;"
                            f"docker kill --signal=9 $id")
                machine._run_cmd(kill_cmd, ssh_key=key_file)

                count_cmd = f"docker ps | grep {short_name} |  wc -l"
                waiters.wait(
                    lambda: machine._run_cmd(count_cmd, ssh_key=key_file).stdout_str == '1',
                    timeout=60, interval=5)

            LOG.debug("The state of the containers after the start of the test")
            state_after = machine._run_cmd("docker ps | grep -v k8s",
                                           ssh_key=key_file).stdout_str
            LOG.debug(f"{state_after}")

            LOG.info("Waiting for pods")
            self.check_k8s_pods(timeout=1200, interval=30)

    def check_ha_kill_ucp_cluster_agent(self):
        """Kill the ucp-cluster-agent container and wait for its recovery

        Precondition - all expected pods and their replicas must be presented

        Scenario:
            1. SSH to the cluster nodes 1 by 1 until a node running the
               ucp-cluster-agent container is found
            2. Kill the container with signal 9
            3. Wait till some node reports exactly one running
               ucp-cluster-agent container
            4. Check that all pods are in a correct state

        Expected result - pods and container are Running and Ready.

        Raises:
            Exception: if the container was not found on any node.
        """
        agent_name = 'ucp-cluster-agent'
        key_file = settings.HA_TEST_PRIVATE_KEY_FILE
        count_cmd = f"docker ps | grep {agent_name} | wc -l"
        machines = self.cluster.get_machines()

        def agent_is_running():
            # True as soon as one node reports exactly one agent container;
            # falls through (returns None) when no node does.
            for machine in machines:
                LOG.info(f"Access {machine.name}")
                raw_count = machine._run_cmd(count_cmd, ssh_key=key_file).stdout_str
                if int(raw_count) == 1:
                    return True
                if int(raw_count) > 1:
                    LOG.warning(f"Found {raw_count} ucp-cluster-agent containers "
                                f"on {machine.name} - only 1 was expected.")

        for machine in machines:
            LOG.info(f"Access {machine.name}")
            found = machine._run_cmd(count_cmd, ssh_key=key_file).stdout_str
            if int(found) == 0:
                continue
            LOG.info(f"Container {agent_name} found. Killing it now")
            kill_cmd = (f"id=`docker ps | grep {agent_name} |"
                        f" awk {{'print $1'}}`;"
                        f"docker kill --signal=9 $id")
            machine._run_cmd(kill_cmd, ssh_key=key_file)
            # The container is killed on the first node where it is found
            break
        else:
            raise Exception('ucp-cluster-agent was not found on any nodes')

        waiters.wait(lambda: agent_is_running(),
                     timeout=60, interval=10)
        self.check_k8s_pods(timeout=1200, interval=30)

    def check_ha_restart_docker_service(self):
        """Restart docker service on every cluster node (one at a time)

        Precondition - all expected pods and their replicas must be presented
        The following scenario is executed for every node

        Scenario:
            1. SSH to node
            2. Restart docker.service via systemctl and pause for 60 seconds
            3. Wait until restart_docker_and_check_status() succeeds
            4. Wait until the list of all pods can be fetched
            5. Check docker service replicas
            6. Check that all pods are Running and Ready

        Expected result - all docker services are OK after
        docker was restarted.
        """
        key_file = settings.HA_TEST_PRIVATE_KEY_FILE
        docker_client = self.cluster.get_dockerclient(private_key=key_file)
        machines = self.cluster.get_machines()
        mke_client = self.cluster.get_mke_dashboardclient()
        worker_agent_name = self.get_ucp_worker_agent_name()

        for machine in machines:
            LOG.info(f"Accessing {machine.name}")
            LOG.info(f"restart docker service on node {machine.name}")
            machine.run_cmd("sudo systemctl restart docker.service", ssh_key=key_file)
            time.sleep(60)

            def docker_recovered():
                return restart_docker_and_check_status(node=machine,
                                                       docker_client=docker_client,
                                                       mke_client=mke_client)

            waiters.wait(
                lambda: docker_recovered(),
                timeout=360, interval=120,
                timeout_msg=f"Failed to restart Docker service on node {machine.name} ")

            LOG.info("Waiting for k8s pods to be ready")
            waiters.wait_pass(
                lambda: self.k8sclient.pods.list_all(), timeout=120)

            LOG.info("Waiting for correct docker service replicas")
            # Check/wait for correct docker service replicas in cluster
            self.check_actual_expected_docker_services(
                changed_after_upd={'ucp-worker-agent-x': worker_agent_name})
            self.check_k8s_pods(timeout=1200, interval=30)

    def check_ha_haproxy_lb(self, ssh_user='mcc-user'):
        """Check that each node can balance traffic through HAProxy

        We have to check that all HAproxy can handle outage of any nodes/services

        The test is skipped unless the cluster provider is one of baremetal,
        vsphere, equinixmetal or equinixmetalv2.

        Scenario:
            1. Select node to serve public ip. Do for each node
            2. Stop mcc-keepalived service on other nodes
            3. Check that API works well
            4. Pause k8s API docker service on other nodes
            5. Check that API is available
            6. Stop k8s API and mke API docker services to drop any connection to HAProxy
            7. Check that docker services were restored and k8s API is available
            8. Check that a pod can be scheduled on the current node
            9. Restore k8s API and keepalived on other nodes
            10. Check that API works well

        Whatever the outcome of the scenario, the ``finally`` section unpauses
        ucp-kube-apiserver and starts mcc-keepalived on every control machine
        and re-checks the k8s nodes and pods.

        Args:
            ssh_user: login passed as ``ssh_login`` to every command executed
                on the control machines.

        Raises:
            AssertionError: if the set of control machines fetched through the
                API differs from the one fetched before the disruption.
        """
        if self.cluster.provider not in (
                utils.Provider.baremetal,
                utils.Provider.vsphere,
                utils.Provider.equinixmetal,
                utils.Provider.equinixmetalv2):
            msg = ("\nHA test to check HAProxy LB service were skipped "
                   "as cluster doesn't support the feature")
            LOG.info(msg)
            pytest.skip(msg)
            # pytest.skip() raises, so this return is only a safeguard
            return

        cluster_release = self.cluster.clusterrelease_version
        LOG.info(f"Cluster release: {cluster_release}")

        ssh_key_file = settings.HA_TEST_PRIVATE_KEY_FILE
        nodes = self.cluster.get_machines(machine_type="control")
        try:
            LOG.info("Cluster has %s control nodes", [m.name for m in nodes])
            # 1. Select one control node
            #    Every control node is visited, then the first one is visited
            #    once more at the end of the loop.
            for one in (nodes + [nodes[0]]):
                info = f"#  Check Haproxy on the node {one.name}  #"
                LOG.info(f"\n{'#' * len(info)}"
                         f"\n{info}"
                         f"\n{'#' * len(info)}")
                # Select other control nodes
                other_nodes = set(nodes) - set([one])
                # 2. Stop Keepalived on other nodes to move VIP to the selected control node
                LOG.info("Turning off Keepalived on %s", [m.name for m in other_nodes])
                for stop_one in other_nodes:
                    LOG.info("Stop keepalived service on %s", stop_one.name)
                    stop_one._run_cmd(
                        "sudo systemctl stop mcc-keepalived.service",
                        verbose=True,
                        ssh_key=ssh_key_file,
                        ssh_login=ssh_user,
                        reconnect=True)

                # 3. Check API
                self.cluster.check.check_k8s_nodes()
                self.cluster.check.check_actual_expected_pods()

                # 4. Turn off k8s API on other nodes
                #    The containers are paused (not stopped) and are unpaused
                #    again in step 9.
                LOG.info("Turning off backend k8s API services on %s", [m.name for m in other_nodes])
                for stop_one in other_nodes:
                    LOG.info("Pause k8s api service on %s", stop_one.name)
                    stop_one._run_cmd(
                        "sudo docker pause ucp-kube-apiserver",
                        verbose=True,
                        ssh_key=ssh_key_file,
                        ssh_login=ssh_user,
                        reconnect=True)

                LOG.info("Wait 10 seconds before HAProxy reacts to stop api services")
                time.sleep(10)

                # 5. Check API availability
                #    The machine list fetched now must contain the same names
                #    as the list fetched before the services were disrupted.
                LOG.info("Getting list of nodes to check API availability")
                assert_nodes = self.cluster.get_machines(machine_type="control")

                assert set(n.name for n in nodes) == set(n.name for n in assert_nodes), \
                    ("Can't fetch list of nodes. It seems that haproxy can't handle "
                     f"service off on {other_nodes}")
                LOG.info(f"Response with machines: {[n.name for n in assert_nodes]}")

                # 6. Simulate haproxy <-> ucp-controller connection failure
                #    MKE containers must be restarted automatically in a few seconds
                one._run_cmd(
                    "sudo docker stop ucp-kube-apiserver; sudo docker stop ucp-controller",
                    verbose=True,
                    ssh_key=ssh_key_file,
                    ssh_login=ssh_user,
                    reconnect=True)

                # 7. Check that MKE containers were restarted and k8s API is available
                #    NOTE(review): wait_pass presumably retries get_machines while
                #    one of the 'expected' exceptions is raised - confirm in waiters.
                LOG.info(f"\n\nWait until k8s API becomes available again "
                         f"and VIP is returned on the machine {one.name}\n")
                waiters.wait_pass(
                    self.cluster.get_machines, timeout=600, interval=10,
                    expected=(ApiException, MaxRetryError, ProtocolError),
                    timeout_msg=f"MKE services were not automatically restarted on the machine {one.name}")

                LOG.info("\n\n*** VIP is restored ***\n")
                LOG.info("Get the list of machines to check API availability after stopping MKE services")
                assert_nodes = self.cluster.get_machines(machine_type="control")
                assert set(n.name for n in nodes) == set(n.name for n in assert_nodes), \
                    ("Can't fetch list of machines. It seems that haproxy can't handle "
                     f"service off on {other_nodes}")
                LOG.info(f"Response with machines: {[n.name for n in assert_nodes]}")

                # 8. Check that a pod can be scheduled on the current node
                #    Use exec_pod_cmd() to run a simple pod
                #    A fixed 5 minute pause precedes the check; the reason for
                #    this particular delay is not visible in this method.
                time.sleep(300)
                one.exec_pod_cmd("docker ps|grep ucp-kube-apiserver; sudo ctr tasks ls;", verbose=True)

                # 9. Restore k8s API and keepalived on all nodes
                LOG.info("Recover k8s api services on %s", [m.name for m in other_nodes])
                for start_one in other_nodes:
                    LOG.info("unpause k8s api service on %s", start_one.name)
                    start_one._run_cmd(
                        "sudo docker unpause ucp-kube-apiserver",
                        verbose=True,
                        ssh_key=ssh_key_file,
                        ssh_login=ssh_user,
                        reconnect=True)

                # API backends are verified before keepalived is started again
                self.cluster.check.check_k8s_nodes()
                self.cluster.check.check_actual_expected_pods()

                LOG.info("Turning on Keepalived on %s", [m.name for m in other_nodes])
                for start_one in other_nodes:
                    LOG.info("Start Keepalived service on %s", start_one.name)
                    start_one._run_cmd(
                        "sudo systemctl start mcc-keepalived.service",
                        verbose=True,
                        ssh_key=ssh_key_file,
                        ssh_login=ssh_user,
                        reconnect=True)

                LOG.info("Wait 30 seconds before HAProxy reacts to start api services. "
                         "Should give time to restore clustered pods.")
                time.sleep(30)

                # 10. Check API
                self.cluster.check.check_k8s_nodes()
                self.cluster.check.check_actual_expected_pods()

                LOG.info("Finish checking HAProxy service on %s", one.name)
        finally:
            # Best-effort recovery: both commands end with '|| true' so that a
            # non-zero exit status of unpause/start itself is masked.
            LOG.info("\n#########################################################"
                     "\n#  Restore k8s and keepalived services on all machines  #"
                     "\n#########################################################")
            for one in nodes:
                LOG.info(f"Unpause k8s service on the node {one.name}")
                one._run_cmd(
                    "sudo docker unpause ucp-kube-apiserver || true",
                    verbose=True,
                    ssh_key=ssh_key_file,
                    ssh_login=ssh_user,
                    reconnect=True)
                LOG.info(f"Start Keepalived service on the node {one.name}")
                one._run_cmd(
                    "sudo systemctl start mcc-keepalived.service || true",
                    verbose=True,
                    ssh_key=ssh_key_file,
                    ssh_login=ssh_user,
                    reconnect=True)
            # Final verification that the cluster is left in a healthy state
            self.cluster.check.check_k8s_nodes()
            self.cluster.check.check_actual_expected_pods()

    def check_ha_ceph_mgr(self):
        """Check that Ceph HA moves the active mgr to a "stand by" node

        The test is skipped unless the cluster provider is baremetal or
        equinixmetalv2.

        Scenario:
            1. Check ceph cluster state and readiness
            2. Check on which node we have active ceph mgr and on which it in stand by
            3. Reboot node with active ceph mgr
            4. Check that all pods are Running and Ready
            5. Check ceph cluster state and readiness
            6. Check that ceph mgrs swapped

        Expected result - Ceph active mgr should migrate to node on which it was
        in stand by state

        Raises:
            AssertionError: if the mgr status is empty, the pod or machine
                hosting the active mgr cannot be determined, or the active
                mgr is the same before and after the reboot.
        """
        if self.cluster.provider not in (
                utils.Provider.baremetal,
                utils.Provider.equinixmetalv2):
            msg = ("\nHA test to check HA  Ceph mgr service were skipped "
                   "as cluster doesn't support the feature")
            LOG.info(msg)
            pytest.skip(msg)

        def get_mgr_status(cluster):
            # Single place for the mgr status lookup: the object and the key
            # holding the full cluster status depend on the workaround flag.
            if cluster.workaround.skip_kaascephcluster_usage():
                full_status = cluster.get_miracephhealth().data.get('status', {}) \
                    .get('fullClusterStatus', {})
            else:
                full_status = cluster.get_cephcluster().data.get('status', {}) \
                    .get('fullClusterInfo', {})
            return full_status.get('daemonsStatus', {}).get('mgr', {}).get('status', '')

        def get_active_mgr_idx(cluster):
            # The first space-separated token of the status is used as the
            # active mgr identifier.
            mgr_status = get_mgr_status(cluster)
            assert mgr_status, 'Ceph mgr status field is empty'
            return mgr_status.split(' ')[0]

        def check_active_mgr_similar(cluster, ceph_tools_pod):
            mgr_status = get_mgr_status(cluster)
            if not mgr_status:
                LOG.warning('Ceph mgr status field is empty')
                return False
            active_mgr_idx = mgr_status.split(' ')[0]

            # Also check status in mgr pod and compare it
            cmd = ['/bin/sh', '-c', 'ceph mgr dump --format json']
            json_ret = ceph_tools_pod.exec(cmd)
            mgr_in_pod = yaml.safe_load(json_ret).get('active_name', '')
            if not mgr_in_pod:
                LOG.warning('mgr name in pos is empty')
                return False

            if mgr_in_pod == active_mgr_idx:
                LOG.info(f'Ceph mgr name in pod = "{mgr_in_pod}" is same as in kcc: "{active_mgr_idx}"')
                return True
            return False

        def ensure_ceph_healthy():
            # Shared by steps 1 and 5: when health is not HEALTH_OK right away,
            # wait for it instead of failing immediately.
            LOG.info("Check ceph cluster status")
            try:
                health_info = self.cluster.check.get_ceph_health_detail()
                assert health_info['status'] == "HEALTH_OK", f'Health is not OK. Will not proceed. ' \
                                                             f'Current ceph health status: {health_info}'
            except AssertionError:
                self.cluster.check.wait_ceph_health_status(
                    timeout=ceph_health_timeout, interval=30)
            self.cluster.check.check_cluster_readiness()
            self.cluster.check.check_ceph_pvc()
            LOG.info("Ceph cluster is healthy.")

        ceph_health_timeout = 2400
        rook_ns = settings.ROOK_CEPH_NS
        ssh_key_file = utils.load_keyfile(settings.HA_TEST_PRIVATE_KEY_FILE)
        pkey = utils.get_rsa_key(ssh_key_file['private'])
        ceph_tools_pod = self.cluster.get_ceph_tool_pod()

        # 1. Check ceph cluster state and readiness
        ensure_ceph_healthy()

        # 2. Check on which node we have active ceph mgr and on which it in stand by
        ceph_mgr_pods = self.k8sclient.pods.list(
            namespace=rook_ns,
            name_prefix='rook-ceph-mgr')
        # Compare mgr status in pod and in kcc
        waiters.wait(lambda: check_active_mgr_similar(self.cluster, ceph_tools_pod),
                     timeout=600,
                     interval=10,
                     timeout_msg='Expected that values will be similar in pod and in kcc')
        # Calculate active mgr node
        initial_active_mgr_idx = get_active_mgr_idx(self.cluster)
        active_mgr_pods = [pod for pod in ceph_mgr_pods
                           if f'rook-ceph-mgr-{initial_active_mgr_idx}' in pod.name]
        assert active_mgr_pods, \
            f'No rook-ceph-mgr pod found for active mgr "{initial_active_mgr_idx}"'
        pod_with_active_mgr = active_mgr_pods[0]
        LOG.info(f'Current pod name with active Ceph mgr: {pod_with_active_mgr.name}')
        # Find node on which active mgr works
        node_with_active_mgr_name = pod_with_active_mgr.data.get('spec', {}).get('node_name', '')
        assert node_with_active_mgr_name, 'node_name field is empty'

        node_with_active_mgr_address = pod_with_active_mgr.data.get('status', {}).get('host_ip', '')
        assert node_with_active_mgr_address, 'host_ip field is empty'

        LOG.info(f'Node with active Ceph mgr pod: {node_with_active_mgr_name} '
                 f'with IP address: {node_with_active_mgr_address}')
        machines_on_node = [machine for machine in self.cluster.get_machines()
                            if machine.get_k8s_node_name() == node_with_active_mgr_name]
        assert machines_on_node, \
            f'No machine found for k8s node "{node_with_active_mgr_name}"'
        machine = machines_on_node[0]

        # 3. Reboot node with active ceph mgr
        LOG.info(f'Accessing {machine.name} node and rebooting it')
        auth = exec_helpers.SSHAuth(username='mcc-user',
                                    password='', key=pkey)
        ssh = exec_helpers.SSHClient(
            host=node_with_active_mgr_address, port=22, auth=auth)
        ssh.logger.addHandler(logger.console)
        ssh.sudo_mode = True
        ssh.reconnect()
        # New SSH connections are rejected before the reboot is triggered
        ssh.check_call("iptables -A INPUT -p tcp --dport 22 -j REJECT; "
                       "/sbin/shutdown -r -f now &>/dev/null & exit")
        LOG.info("Node is being rebooted.")
        waiters.wait_tcp(node_with_active_mgr_address, port=22, timeout=600)
        LOG.info("Node is rebooted succesfully. Establishing new connection...")
        # NOTE(review): this client is not used afterwards; presumably it only
        # proves that SSH is reachable again - confirm before removing.
        ssh = exec_helpers.SSHClient(host=node_with_active_mgr_address, port=22, auth=auth)
        ssh.logger.addHandler(logger.console)

        # 4. Check that all pods are Running and Ready
        LOG.info("Check cluster nodes and pods")
        self.cluster.check.check_k8s_nodes()
        self.cluster.check.check_actual_expected_pods(timeout=1800)

        # 5. Check ceph cluster state and readiness
        ensure_ceph_healthy()

        # 6. Check that ceph mgrs swapped
        # Compare mgr status in pod and in kcc
        # Need to update ceph_tool_pod
        ceph_tools_pod = self.cluster.get_ceph_tool_pod()
        waiters.wait(lambda: check_active_mgr_similar(self.cluster, ceph_tools_pod),
                     timeout=600,
                     interval=10,
                     timeout_msg='Expected that values will be similar in pod and in kcc')

        last_active_mgr_idx = get_active_mgr_idx(self.cluster)
        assert initial_active_mgr_idx != last_active_mgr_idx, (
            f'Initial active manager pod index: '
            f'"{initial_active_mgr_idx}" is same with last '
            f'active manager pod index: "{last_active_mgr_idx}"')
        LOG.info(f'HA test for Ceph mgr pod finished successfully!, '
                 f'initial pod was: "{initial_active_mgr_idx}" '
                 f'and new pod is: "{last_active_mgr_idx}"')

    def calculate_k8s_downtime(self, data: dict) -> list:
        """
        Calculate potential downtime periods based on Prometheus data

        A downtime period is a run of at least two consecutive samples with
        the value "0"; it lasts from the first to the last sample of the run.
        A single failed sample has no measurable duration and is ignored.
        A run that is still in progress at the last sample is reported too.

        Args:
            data: sequence of Prometheus series; only the first one
                  (data[0]) is inspected and its 'values' entry is expected
                  to hold (timestamp, value) pairs. An empty sequence
                  yields an empty result.

        Returns: result list of downtimes, each a dict with 'start' and
                 'stop' (UTC time formatted as %H:%M:%S) and 'duration'
                 (datetime.timedelta)

        """
        time_format = '%H:%M:%S'
        result = []
        if not data:
            return result

        def record_window(first_failed, last_failed):
            # Store the window only when both of its borders were observed
            if first_failed is None or last_failed is None:
                return
            start_data = datetime.fromtimestamp(first_failed, tz=timezone.utc)
            stop_data = datetime.fromtimestamp(last_failed, tz=timezone.utc)
            result.append({
                "start": start_data.strftime(time_format),
                "stop": stop_data.strftime(time_format),
                "duration": stop_data - start_data
            })

        # None is used as "not set" so that a timestamp equal to 0 is valid
        start = None
        stop = None
        for item in data[0].get('values', []):
            if item[1] == "0":
                if start is None:
                    start = item[0]
                else:
                    stop = item[0]
            else:
                record_window(start, stop)
                start = None
                stop = None
        # The series may end while the service is still down
        record_window(start, stop)
        return result

    def check_svc_downtime(self, service_name, expected_downtime, namespace=""):
        """
        Delete one pod of a service and verify the resulting downtime

        The first pod whose name starts with service_name is deleted, the
        pods are awaited and the probe data collected by Prometheus for that
        time window is compared with the allowed downtime.

        Args:
            service_name: service name, also used as the pod name prefix
            expected_downtime: expected downtime in seconds; replaced by
                settings.HA_SL_SVC_HA_MODE_DOWNTIME when SL HA is enabled
            namespace: service namespace

        Returns: None

        """
        window_start = time.time()
        sl_ha_enabled = self.cluster.sl_ha_enabled()
        if sl_ha_enabled:
            expected_downtime = settings.HA_SL_SVC_HA_MODE_DOWNTIME
        LOG.info("SL HA enabled" if sl_ha_enabled else "SL HA disabled")

        victim = self.k8sclient.pods.list_starts_with(service_name, namespace=namespace)[0]
        LOG.info(f"Delete {victim.name} pod")
        victim.delete()

        LOG.info(f"Wait until all {service_name} pod(s) Created and Running")
        self.check_k8s_pods(pods_prefix=service_name,
                            target_namespaces=namespace,
                            timeout=300,
                            interval=30)
        window_end = time.time()

        probe_data = self.cluster.prometheusclient.get_svc_probe_success(
            namespace=namespace, service_name=service_name,
            start=window_start, end=window_end)
        observed_downtime = self.calculate_k8s_downtime(probe_data)
        utils.check_downtime(observed_downtime, expected_downtime)

    def check_helmbundles(self, expected_spec=None, timeout=1800, interval=60):
        """Wait until all expected helm charts are reported as deployed

        Args:
            expected_spec: list of expected releases; when empty, the
                'spec.releases' of the cluster helmbundle is used
            timeout: how long to wait for the charts
            interval: delay between the checks

        Raises:
            TimeoutError: builtin exception raised when the charts are not
                ready in time; the message carries the last release statuses.
        """
        def helmbundles_ready(expected_spec):
            try:
                bundle = self.cluster.get_helmbundle().data
                expected_spec = expected_spec or bundle['spec']['releases']
            except Exception as e:
                # Any failure to read the helmbundle just means "not ready yet"
                LOG.error(e)
                return False

            statuses = (bundle.get('status') or {}).get('releaseStatuses', {})

            wanted_charts = [
                f"{release['name']}:{helpers.get_chart_version(release)}"
                for release in expected_spec]
            reported_charts = [
                f"{chart}:{state['version']}"
                for chart, state in statuses.items()]
            LOG.debug(f"Current status "
                      f"of the charts:\n{yaml.dump(statuses)}")

            # Check that all charts with expected versions
            # are present in the helmbundle status
            missing_charts = set(wanted_charts) - set(reported_charts)
            if missing_charts:
                LOG.error(f"Mismatch between expected and actual helmbundles. "
                          f"Expected: {wanted_charts}, "
                          f"actual: {reported_charts}, "
                          f"difference: {missing_charts}")
                return False

            # Check that no charts in the helmbundle status has False
            unsuccessful = [state for state in statuses.values()
                            if not state['success']]
            if unsuccessful:
                LOG.error(f"The following charts "
                          f"in the failed state:\n"
                          f"{yaml.dump(unsuccessful)}")
                return False

            LOG.info(f"The following charts "
                     f"have been deployed "
                     f"in the cluster:\n{yaml.dump(reported_charts)}")
            return True

        LOG.info("Check helmbundles status")
        try:
            waiters.wait(lambda: helmbundles_ready(expected_spec),
                         timeout=timeout, interval=interval)
        except exceptions.TimeoutError:
            last_status = self.cluster.get_helmbundle().data.get('status') or {}
            last_releases = last_status.get('releaseStatuses', {})
            # NOTE(review): the builtin TimeoutError is raised here, not
            # exceptions.TimeoutError which is caught above.
            raise TimeoutError(f"Not all helmbundles have been deployed: {last_releases}")
        LOG.info("All charts have been successfully deployed")

    def check_svc_access(self, remote):
        """Check that a LoadBalancer service is reachable from a remote host

        Creates the "svc-check" pod from the settings.SVC_CHECK_POD_YAML
        template and the "svc-check-svc" LoadBalancer service in the
        "default" namespace, waits for the external address and fetches it
        with wget from the remote. The pod and the service are deleted only
        when the check succeeds.

        Args:
            remote: object providing check_call() used to run wget

        Raises:
            Exception: if wget fails or the nginx welcome page is not
                found in its output.
        """
        rendered = templates.render_template(settings.SVC_CHECK_POD_YAML)
        LOG.debug(rendered)
        # Round-trip through JSON, as the original template is YAML
        pod_body = json.loads(json.dumps(yaml.load(rendered, Loader=yaml.SafeLoader)))

        LOG.info("Creating nginx pod")
        pod = self.cluster.k8sclient.pods.create(
            name="svc-check", namespace="default",
            body=pod_body
        )
        pod.wait_phase('Running', timeout=1800, interval=10)

        LOG.info("Creating service")
        service = kubernetes.client.V1Service(
            metadata=kubernetes.client.V1ObjectMeta(name="svc-check-svc"),
            spec=kubernetes.client.V1ServiceSpec(
                type="LoadBalancer",
                selector={"run": "svc-check"},
                ports=[kubernetes.client.V1ServicePort(port=80, target_port=80)],
            ),
        )
        svc = self.cluster.k8sclient.services.create(
            name="svc-check-svc", namespace='default', body=service)

        def lb_ingress():
            return svc.read().to_dict()['status']['load_balancer']['ingress']

        LOG.info("Waiting for LB to have external ip/hostname")
        waiters.wait(lambda: lb_ingress() is not None,
                     timeout=500, interval=15,
                     timeout_msg='Timeout waiting for svc loadbalancer')
        ingress_addr = lb_ingress()[0]
        # IP is preferred, hostname is the fallback
        address = ingress_addr['ip'] or ingress_addr['hostname'] or ''
        LOG.info(f"LB iaddress is {address}")

        command = (f'wget -O- --no-proxy {address}'
                   if settings.KAAS_OFFLINE_DEPLOYMENT
                   else f'wget -O- {address}')

        # One initial attempt plus up to three retries on a non-zero exit code
        retries = 3
        while True:
            ret = remote.check_call(command, raise_on_err=False)
            if ret.exit_code == 0 or retries <= 0:
                break
            LOG.warning("Service is not ready yet, retrying in 5 sec ...")
            time.sleep(5)
            retries -= 1

        if ret.exit_code != 0 or "Welcome to nginx" not in ret.stdout_str:
            raise Exception(f"Service '{svc.name}' is not available. "
                            f"Service IP '{address}', target pod '{pod.name}'. "
                            f"Response {ret.stdout_str}")

        LOG.info(f"Service is ready. Retries left: {retries}")
        pod.delete()
        svc.delete()

    def check_resource_discovery(self):
        """Check that the AWS resources object exposes discovered data

        Only suitable for management and regional clusters; for other
        clusters a warning is logged and None is returned. For providers
        other than AWS the check is skipped.

        Raises:
            AssertionError: if regions, instance types or images are empty
                in the selected kaas_awsresources object.
        """
        if not (self.cluster.is_management or self.cluster.is_regional):
            LOG.warning("check_resource_discovery() is not suitable "
                        "for child clusters")
            return None

        if self.cluster.provider != utils.Provider.aws:
            LOG.info("Skipping check_resource_discovery")
            return None

        LOG.info("Checking aws resource discovery")
        kaas_awsresources = self.cluster.k8sclient.kaas_awsresources.list(
            namespace=self.cluster.namespace)
        # Management cluster uses the 'cloud-config' object, a regional one
        # uses the object named after the cluster
        if self.cluster.is_management:
            candidates = [res for res in kaas_awsresources
                          if res.name == 'cloud-config']
        else:
            candidates = [res for res in kaas_awsresources
                          if res.name == self.cluster.name]
        awsresources = candidates[0]

        provider_value = self.cluster.data['spec']['providerSpec']['value']
        aws_region = provider_value['region']

        regions = awsresources.regions
        LOG.debug(f"AWS regions: {regions}")
        instance_types = awsresources.instance_types(aws_region)
        LOG.debug(f"AWS instance types: {instance_types}")
        images = awsresources.images(aws_region)
        LOG.debug(f"AWS images: {images}")

        assert regions, (
            f"No regions were found in "
            f"{awsresources.name} kaas_awsresources crd object")
        assert instance_types, (
            f"No instance_types were found in "
            f"{awsresources.name} kaas_awsresources crd object")
        assert images, (
            f"No images were found in "
            f"{awsresources.name} kaas_awsresources crd object")

    def check_machines_status(self, expected_status='Ready',
                              timeout=settings.SI_CHECK_MACHINES_STATUS_TIMEOUT, interval=30):
        """Wait until all cluster machines reach one of the expected statuses.

        Machines in the 'Disabled' status are tolerated even when 'Disabled'
        is not among the expected statuses.

        :param expected_status: a status name or a list of acceptable names
        :param timeout: passed to waiters.wait_pass as the timeout
        :param interval: passed to waiters.wait_pass as the interval
        :raises TypeError: if expected_status is neither <str> nor <list>
        """
        if not isinstance(expected_status, (str, list)):
            raise TypeError("check_machines_status() parameter 'expected_status' must be <str> or <list>")
        if isinstance(expected_status, str):
            expected_status = [expected_status]

        def all_machines_match(allowed_statuses):
            machines = self.cluster.get_machines()
            LOG.debug(f"Got machines: {machines}")
            matching = [m for m in machines
                        if m.machine_status in allowed_statuses]
            LOG.debug(f"Machines in {allowed_statuses} "
                      f"status: {matching}")
            self.show_machines_conditions()
            if len(machines) == len(matching):
                return True

            # Report every machine outside the allowed statuses, but only
            # the ones that are not 'Disabled' make the check fail
            blocking_names = []
            for machine in machines:
                if machine in matching:
                    continue
                LOG.info(f"{machine.name} machine is "
                         f"in status {machine.machine_status}")
                if machine.machine_status != 'Disabled':
                    blocking_names.append(machine.name)
            if blocking_names:
                LOG.info(f"===Not all machines are "
                         f"in status {allowed_statuses}===")
                raise Exception(f"Not all machines are "
                                f"in status {allowed_statuses}")
            return True

        LOG.info(f"Wait {timeout} seconds until machines will be in "
                 f"the status <{expected_status}>")
        waiters.wait_pass(lambda: all_machines_match(expected_status),
                          timeout=timeout, interval=interval,
                          expected=(Exception, ApiException))
        LOG.info("All machines are in expected status")

    def check_machine_update_group_labels(self, group_name='default', machine_type='worker'):
        """Assert that machines of the given type carry the update group label.

        The expected value is '<cluster name>-<group_name>'; it must be
        contained in the 'kaas.mirantis.com/update-group' label of every
        machine returned by get_machines(machine_type=...).

        :param group_name: update group name appended to the cluster name
        :param machine_type: machine type passed to get_machines()
        :raises AssertionError: if the label is missing or does not contain
            the expected value on any machine
        """
        machines = self.cluster.get_machines(machine_type=machine_type)
        LOG.info(f"Got machines: {machines}")
        # The expected label does not depend on the machine
        group_label = f"{self.cluster.name}-{group_name}"
        for m in machines:
            # A machine without labels (or without the update-group label)
            # must fail the assertion below with a readable message instead
            # of raising AttributeError/TypeError on None
            m_labels = (m.data.get('metadata') or {}).get('labels') or {}
            LOG.debug(f"Metadata on machine : {m_labels}")
            LOG.info(f"Looking for label: {group_label}")
            actual_group = m_labels.get('kaas.mirantis.com/update-group') or ''
            assert group_label in actual_group, \
                f"Missed group label {group_label} on " \
                f"{m.name}, current labels {m_labels}"

    def check_any_machine_exp_status(self, expected_status='Ready',
                                     timeout=1800, interval=30):
        """Wait until at least one cluster machine has the expected status.

        :param expected_status: machine status to look for
        :param timeout: passed to waiters.wait_pass as the timeout
        :param interval: passed to waiters.wait_pass as the interval
        """
        def any_machine_in_status(exp_status):
            machines = self.cluster.get_machines()
            LOG.debug(f"Got machines: {machines}")
            names_in_status = []
            for machine in machines:
                if machine.machine_status == exp_status:
                    names_in_status.append(machine.name)
            LOG.info(f"Machines in {exp_status} "
                     f"status: {names_in_status}")
            if names_in_status:
                return True
            # Raising makes wait_pass retry until the timeout
            raise Exception(f"There are not any machines "
                            f"in status {exp_status}")

        LOG.info(f"Wait {timeout} seconds until any of machines will be in "
                 f"the status <{expected_status}>")
        waiters.wait_pass(lambda: any_machine_in_status(expected_status),
                          timeout=timeout, interval=interval,
                          expected=(Exception, ApiException))
        LOG.info("There is machine in expected status")

    def check_kubeconfig_stored_in_userdata(self, one_machine_check=True,
                                            machines_list=None):
        """Check that the kubeconfig from machine user-data is not usable.

        Since 2.12 the kubeconfig stored in user-data should not be valid.
        The kubeconfig found at '/etc/lcm/kubeconfig' in the user-data of a
        machine is written to a local 'kubeconfig-<machine name>' file and
        used to read the machine's own lcmmachine; the read is expected to
        be rejected as 'Unauthorized'.

        :param one_machine_check: check one randomly chosen machine instead
            of all machines
        :param machines_list: machines to choose from; defaults to all
            machines of the cluster
        """
        candidates = machines_list or self.cluster.get_machines()
        targets = [random.choice(candidates)] if one_machine_check else candidates

        for machine in targets:
            kubeconfig_path = f'kubeconfig-{machine.name}'
            raw_userdata = machine.exec_pod_cmd(
                "sudo cat /var/lib/cloud/instance/user-data.txt",
                verbose=False)['logs']
            userdata = yaml.safe_load(raw_userdata)
            for entry in userdata['write_files']:
                if entry['path'] != '/etc/lcm/kubeconfig':
                    continue
                kubeconfig = entry['content']
                with open(kubeconfig_path, 'w') as f:
                    f.write(kubeconfig)

            read_machine_name = ""
            # Build a new Manager instance of the same class that uses the
            # kubeconfig file taken from the user-data
            kmanager = self.cluster._manager.__class__(kubeconfig=kubeconfig_path)

            def access_is_denied():
                nonlocal read_machine_name
                try:
                    lcmmachine = kmanager.get_lcmmachine(
                        name=machine.name, namespace=self.cluster.namespace)
                    read_machine_name = lcmmachine.read().to_dict()['metadata']['name']
                except ApiException as err:
                    if err.reason != 'Unauthorized':
                        return False
                    LOG.info(f"Machine {machine.name} has not access to its "
                             "own lcmmachine in mgmt cluster as expected")
                    read_machine_name = ""
                    return True
                return False

            # The 7 minutes timeout covers a freshly deployed node: the agent
            # has most likely just picked up a new token created by
            # agent-controller; on its next poll (in 1 minute) it restarts
            # (allow 1 more minute for startup) and updates the token in use
            # in the lcmmachine status, which agent-controller picks up in
            # 5 minutes in the worst case, so 7 minutes in total
            waiters.wait(access_is_denied, timeout=420, interval=30)
            if read_machine_name == machine.name:
                raise Exception("kubeconfig taken from the user-data of "
                                f"node {machine.name} should not have access "
                                "to the mgmt cluster (its own lcmmachine)")

    def check_machines_access_with_given_key(self,
                                             machines_list=None,
                                             verbose=False,
                                             ssh_key_file=None,
                                             ssh_login='mcc-user'):
        """Try to run 'hostname' on the machines using the given SSH key.

        :param machines_list: machines to check; defaults to all machines
            of the cluster
        :param verbose: passed to the machine command runner
        :param ssh_key_file: private key file; defaults to the cluster
            private key
        :param ssh_login: SSH user name
        :return: True if the command succeeded on every machine, else False
        """
        machines = machines_list or self.cluster.get_machines()
        ssh_key_file = ssh_key_file or self.cluster.private_key
        machines_names = [m.name for m in machines]

        unreachable = []
        for machine in machines:
            try:
                machine._run_cmd("hostname",
                                 verbose=verbose,
                                 ssh_key=ssh_key_file,
                                 ssh_login=ssh_login,
                                 reconnect=True)
            except Exception as e:
                # Any failure counts as "no access"; details go to debug log
                LOG.debug("Can't connect to {}: {}".format(machine.name, e))
                unreachable.append(machine.name)

        if not unreachable:
            LOG.info("Succesfully connected to all machines: {} "
                     "using key {}".format(machines_names, ssh_key_file))
            return True

        LOG.warning("No access to next machines: {} "
                    "using key {}".format(unreachable, ssh_key_file))
        return False

    def show_updateplan_steps_status(self, updateplan_steps_status, step_key, steps_id=None):
        """Log a table with the status of the update plan steps.

        :param updateplan_steps_status: iterable of dicts describing steps;
            the keys step_key, "duration", "status" and "message" are shown,
            missing values are rendered as "-"
        :param step_key: key that holds the step identifier in each dict
        :param steps_id: optional collection of step identifiers to be
            marked with "-->" in the first column; nothing is marked
            when omitted
        """
        # The default (None) must not break the membership test below
        if steps_id is None:
            steps_id = ()
        headers = ["   ", "Step", "Duration", "Status", "Message"]
        status_data = [["-->" if data.get(step_key) in steps_id else "",
                        data.get(step_key, "-"),
                        data.get("duration", "-"),
                        data.get("status", "-"),
                        data.get("message", "-")]
                       for data in updateplan_steps_status]
        # Show the update plan steps as a table
        status_msg = tabulate(status_data, tablefmt="presto", headers=headers)
        LOG.info(f"\n{status_msg}\n")

    def show_machines_conditions(self):
        """Log a table with the state of every cluster Machine.

        :return: the list produced by collect_machines_data()
        """
        machines_data = self.collect_machines_data()
        plain_columns = ("name", "lcm_phase", "ready", "distribution", "runtime")
        rows = []
        for entry in machines_data:
            row = [entry[column] for column in plain_columns]
            # Only the conditions that are not ready are displayed
            row.append(entry["conditions"]["not_ready"])
            rows.append(row)
        table = tabulate(
            rows,
            tablefmt="presto",
            headers=["Machine", "LCM Phase", "Ready", "Distribution", "Runtime", "Conditions in progress"])
        LOG.info(f"\n{table}\n")
        return machines_data

    def _check_machines_conditions(self, expected_status, check_managed_clusters=False,
                                   expected_stuck_machine_names=None):
        """Check the expected status for the cluster Machines

        With check_managed_clusters=True:
          * on a management cluster all Machine objects are listed. A Machine
            from settings.CLUSTER_NAMESPACE that is not in expected_status
            only marks the check as failed (RuntimeError at the end), while
            a Machine from any other namespace that is not in expected_status
            raises Exception immediately;
          * LCMMachines in the states Upgrade/Pending/Deploy/Prepare are
            inspected: those from other namespaces raise Exception, those
            from the current cluster namespace mark the check as failed.

        With check_managed_clusters=False the current conditions of the
        cluster Machines are shown, and all Machines that are not disabled
        must be ready and have the "Ready" LCM phase (expected_status is used
        only in the error message in this mode).

        :param expected_status: expected Machine status
        :param check_managed_clusters: check Machines/LCMMachines visible
            from this cluster instead of the Machines of this cluster only
        :param expected_stuck_machine_names: names of the Machines for which
            a stuck LCM operation stops the check immediately; used only
            when check_managed_clusters is False
        :raises exceptions.LCMStuckException: if any Machine from
            expected_stuck_machine_names is stuck
        :raises RuntimeError: if Machines or LCMMachines of the current
            cluster are not in the expected state yet
        :raises Exception: if Machines or LCMMachines of managed clusters
            are not in the expected state
        :return: True when all checks pass
        """
        # The default (None) must behave as "no stuck machines are expected"
        # instead of breaking the membership test below
        expected_stuck_machine_names = expected_stuck_machine_names or []

        machines_status_ok = True
        lcmmachines_status_ok = True

        if check_managed_clusters:
            # mgmt cluster has access to all machines,
            # so we check their status
            if self.cluster.is_management:
                machines = self.k8sclient.kaas_machines.list_raw()
                for machine in machines.to_dict()['items']:
                    m_name = machine['metadata']['name']
                    m_ns = machine['metadata']['namespace']
                    m_labels = machine['metadata'].get('labels', {})
                    cl_name = None if not m_labels else \
                        m_labels.get('cluster.sigs.k8s.io/cluster-name', None)
                    m_ps = machine['status'].get('providerStatus', {})
                    m_status = None if not m_ps else m_ps.get('status', None)
                    # we check that regional and mgmt machines are in
                    # expected_status otherwise we continue to wait
                    # CLUSTER_NAMESPACE is usually default and we expect
                    # regional cluster in this ns as well
                    if m_ns == settings.CLUSTER_NAMESPACE \
                            and m_status != expected_status:
                        LOG.info(f"Found {m_name} machine of {cl_name} "
                                 f"cluster in {m_status} status")
                        machines_status_ok = False
                    # we check that all machines, except mgmt
                    # and regional ones are in expected_status
                    if m_ns != settings.CLUSTER_NAMESPACE \
                            and m_status != expected_status:
                        LOG.error(f"Machine {m_name} of {cl_name} cluster "
                                  f"has status {m_status} while "
                                  f"{expected_status} is expected")
                        raise Exception(f"Machine {m_name} has status "
                                        f"{m_status}")

            # mgmt & regional clusters have access to regional lcmmachines,
            # so we check their status here
            lcmmachines = \
                self.cluster.get_all_lcmmachines(
                    states=["Upgrade", "Pending", "Deploy", "Prepare"]
                )
            lcmm_not_ready = {}
            for lcmm in lcmmachines:
                # if lcmm.data.get('spec', {}).get('disable', False):  # Should not have such status
                #    continue
                if lcmm.namespace != self.cluster.namespace:
                    lcmm_not_ready[lcmm.name] = {'ns': lcmm.namespace}
                else:
                    LOG.info(f"{lcmm.name} lcmmachine of current "
                             f"cluster is not Ready")
                    lcmmachines_status_ok = False
            if lcmm_not_ready:
                LOG.error(f"These managed cluster lcmmachines "
                          f"are not ready: {lcmm_not_ready}")
                raise Exception(f"These regional or child lcmachines "
                                f"are not ready: {lcmm_not_ready}")
        else:
            # Show machines conditions for the current cluster and get machines_data
            machines_data = self.show_machines_conditions()
            # If 'expected_stuck_machine_names' is not empty, then immediately stop waiting if
            # any of 'expected_stuck_machine_names' is stuck
            lcm_stuck_names = [m['name'] for m in machines_data
                               if m["lcm_stuck"] and
                               m['name'] in expected_stuck_machine_names]
            if lcm_stuck_names:
                raise exceptions.LCMStuckException(
                    f"LCM operations stuck on the following Machines: {lcm_stuck_names}")

            # Do not count Machines with disabled LCM operations
            filtered_machines_data = [m for m in machines_data if not m["disabled"]]

            # Check Machine readiness and LCM phases
            lcmmachines_status_ok = all([m["ready"] for m in filtered_machines_data])
            machines_status_ok = all([m["lcm_phase"] == "Ready" for m in filtered_machines_data])

        if not machines_status_ok or not lcmmachines_status_ok:
            raise RuntimeError(f"Not all machines or lcmmachines "
                               f"are in status {expected_status}")
        return True

    def check_update_finished(self, timeout=9300, interval=15,
                              check_managed_clusters=False, expected_stuck_machine_names=None):
        """Wait until the cluster update is finished.

        On every attempt the release from the cluster spec is compared with
        status.providerStatus.releaseRefs.previous.name and the machines
        conditions are checked for the "Ready" status. After a successful
        wait the cached expected objects of the cluster are refreshed.

        :param timeout: passed to waiters.wait_pass as the timeout
        :param interval: passed to waiters.wait_pass as the interval
        :param check_managed_clusters: passed to _check_machines_conditions()
        :param expected_stuck_machine_names: Machines that are expected to
            have stuck LCMMachine tasks (for destructive tests)
        """
        wanted_status = "Ready"
        stuck_names = expected_stuck_machine_names or []

        def update_is_complete(machines_status):
            cluster_status = self.cluster.data['status'] or {}
            release_refs = cluster_status.get('providerStatus', {}).get('releaseRefs', {})
            previous_clusterrelease = release_refs.get('previous', {}).get('name')
            spec_release = \
                self.cluster.spec['providerSpec']['value']['release']
            LOG.debug(f"Expected clusterrelease - {spec_release}, "
                      f"status.providerStatus.releaseRefs.previous.name - "
                      f"{previous_clusterrelease}")
            release_matches = spec_release == previous_clusterrelease
            if not release_matches:
                LOG.info(f"Expected clusterrelease - {spec_release}, "
                         f"actual clusterrelease - "
                         f"{previous_clusterrelease}. "
                         "Update is not finished yet..")

            # Machines are always checked (and shown), even when the release
            # mismatch is already known
            self._check_machines_conditions(
                machines_status, check_managed_clusters, stuck_names)

            if release_matches:
                return True
            raise RuntimeError("Clusterrelease version of cluster hasn't "
                               "been updated")

        LOG.info(f"Waiting {timeout} sec until cluster is fully updated")
        retriable_errors = (RuntimeError, ApiException,
                            KeyError, TypeError, MaxRetryError)
        waiters.wait_pass(lambda: update_is_complete(wanted_status),
                          timeout=timeout, interval=interval,
                          expected=retriable_errors)
        # The cached lists of expected pods/objects in the cluster object
        # have to be refreshed after the update
        self.cluster._refresh_expected_objects()
        LOG.info("Update finished")

    def collect_machines_data(self):
        """Collect a status summary for every Machine of the cluster.

        :return: list of dicts, one per Machine, with the keys:
            "name", "lcm_phase" (status.phase, with " (stuck)" appended when
            the LCMMachine is stuck), "ready", "distribution", "conditions",
            "disabled", "lcm_stuck" and "runtime" (empty string if unknown)
        """
        machines = self.cluster.get_machines()
        machines_data = []
        for machine in machines:
            machine_data = machine.data
            machine_status = machine_data.get("status") or {}
            # 'providerStatus' may be absent or empty
            provider_status = machine_status.get("providerStatus") or {}

            machine_ready = provider_status.get("ready")
            # The phase may be not populated yet: use an empty string
            # so that the " (stuck)" suffix can be appended below without
            # a TypeError
            machine_lcm_phase = machine_status.get("phase") or ""
            machine_conditions = self.cluster.get_conditions_status(machine_data, skip_reboot_conditions=False)
            machine_distribution = provider_status.get("currentDistribution")
            machine_disabled = machine.is_disabled()

            lcm_stuck = machine.is_lcmmachine_stuck()
            # cover case when machine exist but node not yet populated
            try:
                machine_runtime = machine.runtime
            except Exception:
                # Best effort: the runtime is informational only
                machine_runtime = None

            # Collect data for status message
            machines_data.append({
                "name": machine.name,
                "lcm_phase": machine_lcm_phase + (" (stuck)" if lcm_stuck else ""),
                "ready": machine_ready,
                "distribution": machine_distribution,
                "conditions": machine_conditions,
                "disabled": machine_disabled,
                "lcm_stuck": lcm_stuck,
                "runtime": machine_runtime if machine_runtime else "",
            })
        return machines_data

    def wait_distribution_upgrade_finished(self, timeout=10800, interval=120):
        """Wait until the Cluster and all its Machines are ready.

        The wait succeeds when the cluster provider status is ready and every
        Machine is ready and has the "Ready" LCM phase. The cluster and
        Machines status is logged on every attempt.

        :param timeout: passed to waiters.wait as the timeout
        :param interval: passed to waiters.wait as the interval
        """
        def everything_is_ready():
            try:
                cluster_data = self.cluster.data
                provider_status = (cluster_data.get("status") or {}).get("providerStatus", {})
                clusterrelease_version = self.cluster.clusterrelease_version
                cluster_ready = provider_status.get("ready")
                cluster_conditions = self.cluster.get_conditions_status(cluster_data)

                machines_data = self.collect_machines_data()
                rows = []
                for entry in machines_data:
                    rows.append([entry["name"],
                                 entry["lcm_phase"],
                                 entry["ready"],
                                 entry["distribution"],
                                 entry["conditions"]["not_ready"]])
                status_msg = tabulate(
                    rows,
                    tablefmt="presto",
                    headers=["Machine", "LCM Phase", "Ready", "Distribution", "Conditions in progress"])
                LOG.info(f"ClusterRelease: '{clusterrelease_version}'  "
                         f"Ready: '{cluster_ready}'  Conditions in progress: {cluster_conditions['not_ready']}\n"
                         f"{status_msg}\n")

                # The cluster must be ready, and every machine must be ready
                # with the "Ready" LCM phase
                if not cluster_ready:
                    return False
                return all(bool(m["ready"] and m["lcm_phase"] == "Ready")
                           for m in machines_data)
            except (ApiException, MaxRetryError, ProtocolError):
                # API errors are treated as "not ready yet"
                return False

        waiters.wait(everything_is_ready,
                     timeout=timeout,
                     interval=interval,
                     timeout_msg=f"Cluster not became ready in {timeout} sec")

    def check_persistent_volumes_mounts(self, timeout=900, interval=10):
        """Check that local-volume-provisioner host directories exist.

        The expected host directories are taken from the 'classes' values of
        the local-volume-provisioner release in the cluster helmbundle and
        are checked from every pod whose name starts with the chart name.
        MaxRetryError and ApiException are retried until the timeout.

        :param timeout: passed to waiters.wait_pass as the timeout
        :param interval: passed to waiters.wait_pass as the interval
        :raises FileExistsError: if some expected directory is not found
        """
        def check_volumes_mounts():
            if settings.SKIP_CHECK_CHECK_PERSISTENT_VOLUMES_MOUNTS:
                LOG.warning('check_persistent_volumes_mounts: skip')
                return True
            LOG.info("Checking local-volume-provisioner mounts")
            releases = self.cluster.get_helmbundle().data['spec']['releases']
            # NOTE(alexz): lvp has been migrated to kaas/core(local-volume-provisioner)
            # from bm(kaas-bm/local-volume-provisioner) long time ago
            lvp_chart_names = ('kaas-bm/local-volume-provisioner', 'local-volume-provisioner')
            lvp_releases = [rel for rel in releases
                            if rel['name'] in lvp_chart_names]
            chart_name = [rel['name'] for rel in lvp_releases]
            mounts = [rel['values']['classes'] for rel in lvp_releases]

            # The chart may be absent (for kaas < 1.13.0(2.0.0))
            if not chart_name:
                LOG.warning("LVP chart is not used for current release!")
                return True

            assert mounts, "Missing mounts!"
            LOG.info("Expected mounts on the nodes:")
            mount_paths = []
            for mount in mounts[0]:
                mount_paths.append(mount['hostDir'])
                LOG.info(mount['hostDir'])
            if not mount_paths:
                LOG.warning("Local volumes mounts are not described"
                            "(or not used) in hb! Skip checking")
                return True

            lvp_pods = self.cluster.k8sclient.pods.list_starts_with(
                pattern=chart_name[0])
            missing_mounts = []
            for pod in lvp_pods:
                LOG.info(f"Check mounts from pod: {pod.name}")
                for path in mount_paths:
                    LOG.info(f"Mount: '{path}'")
                    if not pod.check_folder_exists(dir_path=path):
                        missing_mounts.append(path)
                # Fail right after the first pod with missing directories
                if missing_mounts:
                    raise FileExistsError(
                        f"Mount {missing_mounts} not found "
                        f"on the host!")
            return True

        waiters.wait_pass(lambda: check_volumes_mounts(),
                          timeout=timeout, interval=interval,
                          expected=(MaxRetryError, ApiException))

    def check_expected_phases_on_machines(self, machines, time_start, phase):
        """Check that machines entered the LCM phase after the given time.

        :param machines: machines to check
        :param time_start: time in the "%Y-%m-%d %H:%M:%S" format followed
            by one extra character, which is dropped before parsing
        :param phase: name of the phase in the lcmmachines timestamps
        :return: list of booleans, one per machine (True if the phase started
            at or after time_start); [False] as soon as the phase start time
            is missing for some machine
        """
        # NOTE(review): both timestamps are parsed as naive datetimes, so
        # astimezone() interprets them in the local timezone of the host -
        # confirm this is intended for 'Z'-suffixed values
        update_started_at = datetime.strptime(
            time_start[:-1], "%Y-%m-%d %H:%M:%S").astimezone(pytz.UTC)
        lcmmachines_timestamps = self.cluster.get_cluster_lcmmachines_timestamps()

        results = []
        for machine in machines:
            machine_phases = lcmmachines_timestamps.get(machine.name, {}).get('phases', {})
            phase_start_time = machine_phases.get(phase, {}).get('startedAt', None)
            if not phase_start_time:
                LOG.info(
                    f"{phase} missing on {machine.name} machine")
                return [False]

            phase_started_at = datetime.strptime(
                phase_start_time, "%Y-%m-%dT%H:%M:%SZ").astimezone(pytz.UTC)
            if phase_started_at >= update_started_at:
                LOG.info(
                    f"Machine {machine.name} started {phase} after updating time"
                )
                results.append(True)
                continue

            results.append(False)
            LOG.info(
                f"Machine {machine.name} started {phase} phase at {phase_started_at} "
                f"before update time ({update_started_at}). "
                f"Waiting for {phase} time to be after the updating time...")

        # All True values mean that every machine entered the selected phase
        # after the specified time
        LOG.debug(f"Map with bool list: {results}")
        if all(results):
            LOG.info(f"All machines entered the {phase} after {time_start}")
        return results

    def wait_machine_status_by_name(self, machine_name, expected_status,
                                    timeout=1800, interval=30):
        """Wait until the named machine reaches the expected status.

        :param machine_name: name of the machine to watch
        :param expected_status: machine status to wait for
        :param timeout: passed to waiters.wait as the timeout
        :param interval: passed to waiters.wait as the interval
        """
        def current_status():
            machine = self.cluster.get_machine(machine_name)
            if machine:
                status = machine.machine_status
                LOG.info(f"Machine {machine_name} has <{status}> status")
                return status
            # A missing machine is reported as "no status"
            LOG.warning(f"Machine {machine_name} not found")
            return None

        LOG.info(f"Wait {timeout} seconds until machine {machine_name} "
                 f"will be in the status <{expected_status}>")
        waiters.wait(lambda: current_status() == expected_status,
                     timeout=timeout, interval=interval)
        LOG.info(f"{machine_name} is in expected status {expected_status}")

    def check_boot_from_volume_by_machine_name(self, machine_name,
                                               openstack_client,
                                               boot_volume_size=80):
        """Check the bootFromVolume option of a machine.

        The machine spec must have bootFromVolume enabled with the expected
        volume size, and the matching OpenStack server must have an empty
        image field.

        :param machine_name: name of the machine to check
        :param openstack_client: client used to look up the server by name
        :param boot_volume_size: expected bootFromVolume.volumeSize value
        :raises AssertionError: if the machine spec does not match
        :raises Exception: if the OpenStack server image is not empty
        """
        machine = self.cluster.get_machine(machine_name)

        def boot_from_volume_option(option):
            # Read an option of bootFromVolume from the machine providerSpec
            provider_value = machine.data.get('spec').get('providerSpec').get('value')
            return provider_value.get('bootFromVolume', {}).get(option)

        # The bootFromVolume option must still be present in the machine CRD
        current_boot_from_volume = boot_from_volume_option('enabled')
        assert current_boot_from_volume is True, (
            f"Provided boot_from_volume: True flag "
            f"is not the same as in machine CRD: "
            f"{current_boot_from_volume}")

        current_boot_volume_size = boot_from_volume_option('volumeSize')
        assert boot_volume_size == current_boot_volume_size, (
            f"Provided boot_volume_size: {boot_volume_size} "
            f"is not same as in machine CRD: "
            f"{current_boot_volume_size}")

        # The server image reported by the openstack client is expected
        # to be empty when bootFromVolume is in effect
        server_name = self.cluster.get_machine_openstack_name(machine_name)
        openstack_machine = openstack_client.get_server_by_name(server_name)
        if openstack_machine.image:
            raise Exception(f"Image field for machine: {machine_name} is NOT empty, but should be, "
                            f"and contains next image(s): {openstack_machine.image}")

        LOG.info(f"All checks on machine: {machine_name} for bootFromVolume option done successfully, "
                 f"CRD is correct, openstack image is empty")

    def check_lcmmachine_releases(
            self, machine_type, expected_release, timeout=1800, interval=60):
        """Wait until LCMMachines of the given type have the expected release.

        For every LCMMachine both spec.release and status.release must be
        equal to expected_release.

        :param machine_type: type of the LCMMachines to check
        :param expected_release: release expected in spec and status
        :param timeout: passed to waiters.wait as the timeout
        :param interval: passed to waiters.wait as the interval
        """
        lcmmachines = self.cluster.get_cluster_lcmmachines_by_type(
            self.cluster.name, self.cluster.namespace, machine_type
        )

        def releases_match():
            # Stops at the first LCMMachine that does not match
            return all(
                lcmm.data.get("spec", {}).get("release")
                == lcmm.data.get("status", {}).get("release")
                == expected_release
                for lcmm in lcmmachines)

        waiters.wait(releases_match, timeout=timeout, interval=interval)

    def check_cluster_release_in_spec(
            self, expected_clusterrelease, timeout=300, interval=10):
        """Wait until the cluster spec has the expected clusterrelease.

        Nothing is checked when expected_clusterrelease is empty.

        :param expected_clusterrelease: expected clusterrelease version
        :param timeout: passed to waiters.wait as the timeout
        :param interval: passed to waiters.wait as the interval
        :raises TimeoutError: if the version does not match in time
        """
        if not expected_clusterrelease:
            return

        LOG.info(f"Checking cluster spec clusterrelease version. "
                 f"Expected: {expected_clusterrelease}")

        def spec_release_matches():
            return self.cluster.cluster_spec_release_version == expected_clusterrelease

        try:
            waiters.wait(spec_release_matches,
                         timeout=timeout,
                         interval=interval)
        except exceptions.TimeoutError:
            # Re-raise with the actual and the expected versions
            raise TimeoutError(
                f"Clusterrelease version mismatch. "
                f"Actual: {self.cluster.cluster_spec_release_version}, "
                f"expected: {expected_clusterrelease}")
        LOG.info("Clusterrelease version is correct")

    def check_cluster_release(self, expected_clusterrelease,
                              timeout=1000, interval=30):
        """Wait until the cluster has the expected clusterrelease version.

        Nothing is checked when expected_clusterrelease is empty.

        :param expected_clusterrelease: expected clusterrelease version
        :param timeout: passed to waiters.wait as the timeout
        :param interval: passed to waiters.wait as the interval
        :raises TimeoutError: if the version does not match in time
        """
        if not expected_clusterrelease:
            return

        LOG.info(f"Checking cluster clusterrelease version. "
                 f"Expected: {expected_clusterrelease}")

        def release_matches():
            return self.cluster.clusterrelease_version == expected_clusterrelease

        try:
            waiters.wait(release_matches,
                         timeout=timeout,
                         interval=interval)
        except exceptions.TimeoutError:
            # Re-raise with the actual and the expected versions
            raise TimeoutError(
                f"Clusterrelease version mismatch. "
                f"Actual: {self.cluster.clusterrelease_version}, "
                f"expected: {expected_clusterrelease}")
        LOG.info("Clusterrelease version is correct")

    def check_actual_expected_pods(self,
                                   expected_pods=None,
                                   timeout=settings.CHECK_ACTUAL_EXPECTED_PODS_TIMEOUT,
                                   interval=10,
                                   exclude_pods=None,
                                   exclude_jobs=True,
                                   exclude_removed_nodes=True,
                                   exclude_incorrect_affinity=True,
                                   exclude_node_shutdown=True,
                                   check_all_nss=False):
        """Wait until the actual pods of this cluster match the expected ones.

        The expected pods are fetched automatically unless explicitly
        provided. Only the namespaces listed in the expected pods dict are
        compared; pods found in any other namespace are logged as errors and
        fail the check only when ``check_all_nss`` is True.

        The check is skipped entirely when
        ``settings.SKIP_CHILD_EXPECTED_POD_CHECK`` is set.

        :param expected_pods: dict ``{namespace: {pod name prefix: count}}``;
            a prefix may carry a "/no_owner" tag, in which case the count from
            this dict is used without querying the pod owner. When empty, it
            is taken from ``self.cluster.get_expected_objects()`` (if
            ``exclude_pods`` is given) or from ``self.cluster.expected_pods``
        :param timeout: timeout passed to the waiter
        :param interval: polling interval passed to the waiter
        :param exclude_pods: passed to ``self.cluster.get_expected_objects()``
            when ``expected_pods`` is not provided
        :param exclude_jobs: ignore pods whose first owner reference is a Job
        :param exclude_removed_nodes: ignore pods bound (by node name or by the
            'kubernetes.io/hostname' node selector) to nodes that are not in
            the current node list
        :param exclude_incorrect_affinity: ignore pods in 'Failed' phase with
            the 'NodeAffinity' reason
        :param exclude_node_shutdown: ignore pods terminated by kubelet
            because of a node shutdown
        :param check_all_nss: when True, pods from namespaces that are missing
            in the expected pods dict are reported as "not checked"
        :raises TimeoutError: if the mismatch is still present after
            ``timeout`` seconds
        """

        if settings.SKIP_CHILD_EXPECTED_POD_CHECK:
            LOG.info("Skipping expected pods checking")
            return

        LOG.info("Checking that all pods and their replicas are in place")

        def compare(expected_pods_dict,
                    exclude_jobs):
            """Run a single comparison of the actual pods with the expected ones.

            Returns a dict describing the mismatch, or None (implicitly)
            when everything matches, so that ``not compare(...)`` is True
            on success.
            """
            failed = {}
            namespaces = expected_pods_dict.keys()
            actual_list = [pod for pod in
                           self.k8sclient.pods.list_raw().to_dict()['items']]

            if exclude_jobs:
                before_filtering = \
                    set([x['metadata']['name'] for x in actual_list])
                # owner_references is checked for truthiness first because
                # it may be empty for pods without an owner
                actual_list = \
                    [x for x in actual_list if not (
                        x['metadata']['owner_references'] and
                        x['metadata'][
                            'owner_references'][0]['kind'] == 'Job')]
                after_filtering = \
                    set([x['metadata']['name'] for x in actual_list])
                LOG.debug(f"These pods are jobs and will be filtered out: "
                          f"{before_filtering - after_filtering}")

            if exclude_removed_nodes:
                # After cluster scale down, some pods may be still mapped on the
                # removed nodes. Garbage collector may clean these pods in few hours
                # but we can ignore such pods here
                node_names = [node.name for node in self.k8sclient.nodes.list_all()]
                before_filtering = set([x['metadata']['name'] for x in actual_list])

                new_actual_list = []
                filtered_nodes = set()
                for pod in actual_list:
                    pod_name = f"{pod['metadata']['namespace']}/{pod['metadata']['name']}"

                    # Case 1: the pod is already scheduled to a node that no longer exists
                    node_name = pod['spec']['node_name']
                    if node_name is not None and node_name not in node_names:
                        LOG.warning(f"Pod {pod_name} node name is bound to non-existing node {node_name}, "
                                    f"skip check the pod")
                        filtered_nodes.add(node_name)
                        continue

                    # Case 2: the pod is pinned by hostname selector to a node that no longer exists
                    node_selector = pod['spec']['node_selector'] or {}
                    node_selector_hostname = node_selector.get('kubernetes.io/hostname')
                    if node_selector_hostname is not None and node_selector_hostname not in node_names:
                        LOG.warning(f"Pod {pod_name} node selector is bound to non-existing hostname "
                                    f"{node_selector_hostname}, skip check the pod")
                        filtered_nodes.add(node_selector_hostname)
                        continue

                    new_actual_list.append(pod)
                actual_list = new_actual_list
                filtered_nodes = list(filtered_nodes)

                after_filtering = set([x['metadata']['name'] for x in actual_list])

                if before_filtering - after_filtering:
                    LOG.warning(f"These pods will be ignored because assigned to non-existing nodes "
                                f"{filtered_nodes} : {before_filtering - after_filtering}")

            if exclude_incorrect_affinity:
                # After changing node labels, some pods may no longer match the node affinity
                before_filtering = set([x['metadata']['name'] for x in actual_list])

                actual_list = [x for x in actual_list
                               if not (x['status'].get('phase', '') == 'Failed'
                                       and x['status'].get('reason') == 'NodeAffinity')]

                after_filtering = set([x['metadata']['name'] for x in actual_list])

                if before_filtering - after_filtering:
                    LOG.warning(f"These pods will be ignored because of wrong NodeAffinity: "
                                f"{before_filtering - after_filtering}")

            if exclude_node_shutdown:
                # If a reboot is a part of the release, some pods are terminated
                # by kubelet as part of graceful shutdown
                before_filtering = set([x['metadata']['name'] for x in actual_list])

                filtered = []

                for pod in actual_list:
                    conditions = pod['status'].get('conditions') or []
                    phase = pod['status'].get('phase')
                    reason = pod['status'].get('reason')
                    message = pod['status'].get('message', '')

                    # A 'DisruptionTarget' condition set by kubelet because of a node shutdown
                    has_bad_condition = any(
                        cond.get('type') == 'DisruptionTarget' and
                        cond.get('reason') == 'TerminationByKubelet' and
                        cond.get('status') == 'True' and
                        'node shutdown' in cond.get('message', '')
                        for cond in conditions
                    )

                    # The pod itself is marked as failed because the node was shutting down
                    is_failed_by_node_shutdown = (
                        phase == 'Failed' and
                        reason == 'NodeShutdown' and
                        'shutting down' in message
                    )

                    # Keep only the pods that match neither of the two shutdown markers
                    if not has_bad_condition and not is_failed_by_node_shutdown:
                        filtered.append(pod)

                actual_list = filtered
                after_filtering = set([x['metadata']['name'] for x in actual_list])

                if before_filtering - after_filtering:
                    LOG.warning(f"These pods will be ignored because of graceful node shutdowns: "
                                f"{before_filtering - after_filtering}")

            # Pod name prefixes that are always excluded from the check
            excluded_pods = ['elasticsearch-curator',
                             # SI pods
                             'ui-e2e-test', 'iam-bdd', 'conformance',
                             'test-stacklight', 'kaas-os-provider-e2e',
                             # to be checked
                             'patroni-lbauth',
                             'patroni-12-endpoint-cleanup',
                             'neutron-netns-cleanup-cron-default',
                             'ucp-secureoverlay',
                             'tf-test-tungsten-pytest',  # exist after TF test
                             # checked as a part of tungsten_fabric.py::deploy_tungsten_fabric
                             'tf-vrouter-provisioner',
                             'tf-vrouter-agent',
                             'tf-vrouter-vgw',
                             'tf-tool'
                             ]
            excluded_pods += self.EXCLUDED_PODS
            # special exclusions: drop every pod whose name starts with an excluded prefix
            for excl_pod in excluded_pods:
                actual_list = \
                    [x for x in actual_list
                     if not x['metadata']['name'].startswith(excl_pod)]

            # Split the remaining pods: only the namespaces from the expected
            # pods dict are compared, the rest is handled after the main loop
            other_ns_pods = [pod for pod in actual_list
                             if pod['metadata']['namespace'] not in namespaces]
            actual_list = [pod for pod in actual_list
                           if pod['metadata']['namespace'] in namespaces]

            # Names of the pods which are not matched by any expected prefix yet
            not_checked = [pod['metadata']['name'] for pod in actual_list]

            # sort names of expected pods so that
            # long names come first. "/no_owner" tag is excluded
            for ns in namespaces:
                expected_pods_lst = sorted(
                    list(expected_pods_dict[ns].keys()),
                    key=lambda x: len(x.split("/")[0]), reverse=True
                )
                for pod in expected_pods_lst:
                    desired_num = expected_pods_dict[ns][pod]
                    prefix = pod.split("/")[0]
                    # Pods of this namespace matching the prefix which were not
                    # claimed by a longer prefix on a previous iteration
                    compare_list = \
                        [pod for pod in actual_list
                         if pod['metadata']['name'].startswith(prefix) and
                         pod['metadata']['name'] in not_checked and
                         pod['metadata']['namespace'] == ns]

                    if not compare_list:
                        failed[prefix] = {"actual": 0,
                                          "desired/expected": desired_num}
                        continue

                    if '/no_owner' in pod:
                        # for pods that are marked "no_owner" we do not fetch
                        # owner and use # of pods from the file
                        LOG.debug(f"Number of pods "
                                  f"for {prefix} group will be checked "
                                  f"according to expected pods list")
                    else:
                        # get owner kind and name for the first pod in list
                        # NOTE(review): owner_references looks like it may be
                        # empty for a pod without an owner (the Job filter above
                        # guards for that); such a pod without the "/no_owner"
                        # tag would fail on the index below - confirm
                        first_pod = compare_list[0]
                        owner_references = \
                            first_pod['metadata']['owner_references']
                        kind_name = f"{owner_references[0]['kind']}/" \
                                    f"{owner_references[0]['name']}"

                        if kind_name.split('/')[0] == "Node":
                            # Starting from 1.17 mirror pods have
                            # Node/node-name in owner_references
                            LOG.warning(f"No replica count info for pod "
                                        f"{first_pod['metadata']['name']}. "
                                        f"Using expected number ({desired_num}) "
                                        f"from the list")
                            # Better to handle this type of pods as '/no_owner'
                            LOG.info("Please add '/no_owner' for this group of"
                                     " pods to expected pods list file")
                        else:
                            try:
                                replicas_num = self.__get_replica_number(
                                    kind_name,
                                    first_pod['metadata']['namespace']
                                )
                                LOG.debug(f"First pod is "
                                          f"{first_pod['metadata']['name']}, "
                                          f"owner: {kind_name}, "
                                          f"replica #: {replicas_num}")
                                if int(replicas_num) != int(desired_num):
                                    if kind_name.startswith("DaemonSet"):
                                        LOG.warning(f"Replicas num ({replicas_num}) from {kind_name} "
                                                    f"is not equal to the "
                                                    f"number ({desired_num}) in expected "
                                                    f"pod list. Pod: {prefix} . "
                                                    f"Assuming DaemonSet replicas as expected pods number")
                                        desired_num = int(replicas_num)
                                    else:
                                        LOG.warning(f"Replicas num ({replicas_num}) from {kind_name} "
                                                    f"is not equal to the "
                                                    f"number ({desired_num}) in expected "
                                                    f"pod list. Pod: {prefix}")
                                    # PRODX-30560, DaemonSets may have different replicas
                                    # on different labs because of node taints or labels.
                                    # So do not fail the check until found a better solution.
                                    # failed[prefix] = {"object replicas": replicas_num,
                                    #                  "desired/expected": desired_num}
                            except (Exception, ApiException) as e:
                                # The owner could not be processed: report the
                                # whole group as failed and go to the next prefix
                                LOG.error(e)
                                LOG.error(f"Cannot process {prefix} "
                                          f"group of pods. Skipping")
                                failed[prefix] = {"actual": 0,
                                                  "desired/expected": desired_num}
                                continue
                    # The number of actually found pods must match the desired number
                    if int(desired_num) != len(compare_list):
                        failed[prefix] = {"actual": len(compare_list),
                                          "desired/expected": desired_num}
                    # Mark the matched pods as checked, so that shorter prefixes
                    # do not count them again
                    not_checked = [x for x in not_checked
                                   if x not in [pod['metadata']['name']
                                                for pod in compare_list]]

                    actual_ns = set([pod['metadata']['namespace']
                                     for pod in compare_list])
                    if set([ns]) != actual_ns:
                        failed[prefix] = {"actual namespace": actual_ns,
                                          "desired/expected": ns}

            if other_ns_pods:
                for pod in other_ns_pods:
                    LOG.error(f"Extra pod {pod['metadata']['name']} "
                              f"found in {pod['metadata']['namespace']} "
                              f"namespace")
                    # Extra pods fail the check only when all namespaces must be checked
                    if check_all_nss:
                        not_checked.append(pod['metadata']['name'])

            if failed or not_checked:
                result = {"Pods mismatch": failed,
                          "Not checked pods": not_checked}
                LOG.warning(f"Compare pod check failed: {result}")
                return result

        if not expected_pods:
            if exclude_pods:
                expected_pods, do = self.cluster.get_expected_objects(
                    exclude_pods=exclude_pods)
            else:
                expected_pods = self.cluster.expected_pods

        try:
            # compare() returns a non-empty dict on mismatch and None on success
            waiters.wait(lambda: not compare(
                expected_pods_dict=expected_pods,
                exclude_jobs=exclude_jobs),
                timeout=timeout, interval=interval)
        except exceptions.TimeoutError:
            # Run the comparison once more to get the latest mismatch details
            # for the error message
            result = compare(expected_pods_dict=expected_pods,
                             exclude_jobs=exclude_jobs)
            if result:
                err = f"Timeout waiting for pods. " \
                      f"After {timeout}s there are some fails: " \
                      f"{result}"
                raise TimeoutError(err)
        LOG.info("All pods and their replicas are found")

    @retry(AssertionError, delay=60, tries=10, logger=LOG)
    def check_k8s_resources_after_update(self,
                                         cluster_resources_before,
                                         reboot_expected_nodes=None):
        """Check that fixed cluster resources were not changed by an update.

        Collects ``self.cluster.describe_fixed_resources()`` and compares it
        with the snapshot taken before the update:

        * keys that exist only after the update are just logged;
        * keys that exist only before the update fail the check, unless they
          match ``gone_keys_to_skip`` or belong to the node of a disabled
          machine;
        * keys that exist in both snapshots must have equal values, unless
          the key contains one of the ``labels_to_skip`` substrings.

        The whole check is retried by the ``retry`` decorator on
        AssertionError.

        :param cluster_resources_before: dict with the fixed resources
            collected before the update
        :param reboot_expected_nodes: dict keyed by k8s node name; for nodes
            with a truthy value the kernel version labels are not compared
        :raises AssertionError: if common resources differ or old objects
            disappeared
        """
        reboot_expected_nodes = reboot_expected_nodes or {}

        # temporary exclude capacity/pods, check PRODX-11503
        # temporary exclude capacity/memory, check PRODX-18124
        # temporarily exclude feature.node.kubernetes.io/system-os_release*, check FIELD-5091
        # temporarily exclude system-os_release.ID
        # FIELD-5091 is fixed so OS is reported correctly and values are different from MKE 3.5.x
        # also, as for 2.25 releases, skip pstate
        labels_to_skip = {'capacity/pods',
                          'capacity/memory',
                          'labels/feature.node.kubernetes.io/',
                          }

        # exclude CPUID.SSE42 and CPUID.SSE4 labels as they may disappear during K8s upgrade
        # Also exclude iommu-enabled as it was deprecated in k8s: PRODX-30480
        gone_keys_to_skip = {'labels/feature.node.kubernetes.io/.*',
                             }
        new_keys_to_skip = {'labels/feature.node.kubernetes.io/.*',
                            }

        nodes_to_exclude = []
        for machine in self.cluster.get_machines():
            node_name = machine.get_k8s_node_name()
            if machine.is_disabled():
                nodes_to_exclude.append(node_name)
            if reboot_expected_nodes.get(node_name):
                # Generate per-node reboot skip list. Exactly 'node' name , not Machine name
                _reboot_skip_list = {f'node/{node_name}/labels/feature.node.kubernetes.io/kernel-version.full',
                                     f'node/{node_name}/labels/feature.node.kubernetes.io/kernel-version.major',
                                     f'node/{node_name}/labels/feature.node.kubernetes.io/kernel-version.minor'}
                labels_to_skip.update(_reboot_skip_list)

        LOG.info(f'Labels to skip: {labels_to_skip}')
        LOG.info("Collect cluster fixed resources after update")
        cluster_resources_after = self.cluster.describe_fixed_resources()

        LOG.debug(f"Cluster resources before update:\n"
                  f"{yaml.dump(cluster_resources_before)}")
        LOG.debug(f"Cluster resources after update:\n"
                  f"{yaml.dump(cluster_resources_after)}")

        before_keys = cluster_resources_before.keys()
        after_keys = cluster_resources_after.keys()

        common_keys = before_keys & after_keys
        new_keys = after_keys - before_keys
        gone_keys = before_keys - after_keys

        # Compile the skip patterns once instead of on every key
        new_skip_patterns = [re.compile('node/.*/' + skip_key)
                             for skip_key in new_keys_to_skip]
        gone_skip_patterns = [re.compile('node/.*/' + skip_key)
                              for skip_key in gone_keys_to_skip]
        # Node names are escaped and must be followed by '/' or the end of
        # the key: otherwise a disabled node 'worker-1' would also hide the
        # disappeared objects of 'worker-10', and '.' in a node name would
        # match any character
        excluded_nodes_pattern = None
        if nodes_to_exclude:
            escaped_names = '|'.join(re.escape(name) for name in nodes_to_exclude)
            excluded_nodes_pattern = re.compile(f'node/(?:{escaped_names})(?:/|$)')

        LOG.info("Check for NEW appeared objects")
        new_objects = {
            k: cluster_resources_after[k] for k in new_keys
            if not any(pattern.match(k) for pattern in new_skip_patterns)}
        if new_objects:
            # New objects are informational only and do not fail the check
            LOG.info(f"The following NEW objects appeared:\n{yaml.dump(new_objects)}")

        LOG.info("Check for OLD disappeared objects")
        gone_objects = {}
        for k in gone_keys:
            if any(pattern.match(k) for pattern in gone_skip_patterns):
                continue
            if excluded_nodes_pattern and excluded_nodes_pattern.match(k):
                # Resources of disabled machines are expected to disappear
                continue
            gone_objects[k] = cluster_resources_before[k]

        LOG.info("Check that fixed resources were not changed after update")
        before = {}
        after = {}
        for common_label in common_keys:
            if not any(skip_label in common_label for skip_label in labels_to_skip):
                before[common_label] = cluster_resources_before[common_label]
                after[common_label] = cluster_resources_after[common_label]

        # Compare the YAML dumps, so that the assertion message is readable
        before = yaml.dump(before)
        after = yaml.dump(after)

        assert before == after, (f"Some cluster resources were changed."
                                 f"\nBefore:\n{before}\n"
                                 f"\nAfter:\n{after}")
        assert not gone_objects, (f"The following OLD objects disappeared:"
                                  f"\n{yaml.dump(gone_objects)}")

    def get_ucp_worker_agent_name(self):
        """Find the service name for 'ucp-worker-agent-*' for the current MKE

        Swarm services are read from ``self.cluster.mkedashboardclient``.
        Windows agents ('ucp-worker-agent-win*') are ignored. If several
        agent services are found, the one with the highest
        'com.docker.ucp.version' label is selected.

        :return: name of the selected docker service
        :raises Exception: if no 'ucp-worker-agent-*' service is found
        """
        def filter_ucp_worker_agent_service(service):
            service_name = service['Spec']['Name']
            return (
                service_name.startswith('ucp-worker-agent-') and
                'ucp-worker-agent-win' not in service_name)

        def version_sort_key(ucp_version):
            # Compare numeric components as integers, so that '3.10.0' is
            # newer than '3.9.0' (plain string comparison gets this wrong).
            # The raw string is used only as a tie-breaker.
            # NOTE(review): pre-release suffixes are not ranked below the
            # release here - confirm whether such labels can appear.
            numbers = [int(part) for part in re.findall(r'\d+', str(ucp_version))]
            return numbers, str(ucp_version)

        dashboardclient = self.cluster.mkedashboardclient
        docker_services = dashboardclient.get_swarm_services()

        filtered_services = [service for service in docker_services
                             if filter_ucp_worker_agent_service(service)]
        if not filtered_services:
            raise Exception(f"Docker service 'ucp-worker-agent-*' not found "
                            f"on nodes of the cluster {self._cluster.name}")

        # Map MKE version -> service name
        service_names = {
            s['Spec']['Labels']['com.docker.ucp.version']: s['Spec']['Name']
            for s in filtered_services
        }
        LOG.debug(f"Found the following worker agents:\n{service_names}")
        latest_ucp_version = max(service_names, key=version_sort_key)
        service_name = service_names[latest_ucp_version]
        LOG.info(f"Current name of ucp-worker-agent-* service for MKE version "
                 f"{latest_ucp_version} is: {service_name}")
        return service_name

    def check_actual_expected_docker_services(self,
                                              expected_services=None,
                                              excluded_services=None,
                                              changed_after_upd=None,
                                              expected_inactive_service_replicas=None,
                                              timeout=1200,
                                              interval=10):
        """Compare expected list of docker services (which are fetched
           automatically, unless explicitly provided) with actual
           list of docker services in this UCP cluster

           The check passes only when the comparison stays successful for
           the whole stabilization window (see compare_twice() below).
           It is skipped entirely when
           settings.SKIP_CHECK_ACTUAL_EXPECTED_DOCKER_SERVICES_CHECK is set.

           :param expected_services: dict {service name: expected replicas};
                  when empty, taken from
                  self.cluster.expected_docker_objects["service"]
           :param excluded_services: service names which are not checked
           :param changed_after_upd: dict {old service name: new service name},
                  used to rename keys of the automatically fetched
                  expected services
           :param expected_inactive_service_replicas: dict of expected number of inactive service replicas
           :param timeout: overall timeout for the outer waiter
           :param interval: polling interval for the outer waiter
           :raises TimeoutError: if the services did not become stable
                  within the timeout
           """
        # storage for the status message from the latest check
        latest = {"status": ""}
        expected_inactive_service_replicas = expected_inactive_service_replicas or dict()

        if settings.SKIP_CHECK_ACTUAL_EXPECTED_DOCKER_SERVICES_CHECK:
            LOG.warning('Skipping check_actual_expected_docker_services')
            return

        def compare(expected_services, excluded_services):
            """Get actual/expected docker service replicas and
               compare it to the expected_services from tests
               or from si_tests/templates/expected-pods/<clusterrelease>.yaml

               :rtype bool: False means that the errors were found
                            True means the check passed
            """
            failed = {}
            service_replicas = {}

            try:
                dashboardclient = self.cluster.mkedashboardclient
                docker_services = dashboardclient.get_swarm_service_replicas()
            except exceptions.TimeoutError as e:
                # Timeouts are propagated to the caller as is
                LOG.warning(f"Got a TimeoutError from compare_twice():\n{e}")
                raise e
            except Exception as e:
                # Any other error is treated as a failed comparison,
                # so the check will be repeated by the waiters
                msg = f"Error reading docker swarm services:\n{e}"
                LOG.warning(msg)
                latest["status"] = msg
                self.cluster.docker_ps()
                return False

            # Collect running/desired replicas for the services to be checked
            for name, replicas in docker_services.items():
                LOG.debug(f"Docker service: {name}: {replicas}")
                r_actual = int(replicas["RunningTasks"])
                r_expected = int(replicas["DesiredTasks"])
                if r_expected == 0:
                    LOG.debug(f"Skip docker service '{name}' "
                              f"because in docker defined 0 replicas")
                    continue
                if name in excluded_services:
                    LOG.debug(f"Skip docker service '{name}' "
                              f"because it is in the skip list")
                    continue
                service_replicas[name] = {}
                service_replicas[name]["actual"] = r_actual
                service_replicas[name]["expected"] = r_expected

            # Actual services which are not matched by any expected service yet
            not_checked = service_replicas.keys()

            for svc in expected_services.keys():
                if svc in excluded_services:
                    LOG.debug(f"Skip expected service '{svc}' "
                              f"because it is in the skip list")
                    continue

                # Replicas expected by the test, minus the replicas
                # which are expected to be inactive
                num = int(expected_services[svc]) - int(expected_inactive_service_replicas.get(svc, 0))

                if svc not in service_replicas.keys():
                    failed[svc] = {"actual": 0,
                                   "test expected": num}
                    LOG.debug(f"Docker service '{svc}' not found "
                              f"while it is expected with '{num}' replicas")
                    continue

                r_actual = int(service_replicas[svc]["actual"])
                r_expected = int(service_replicas[svc]["expected"])
                # Test expectation, docker desired replicas and running replicas must all match
                if int(num) != r_expected or r_expected != r_actual:
                    failed[svc] = {"actual": r_actual,
                                   "docker expected": r_expected,
                                   "test expected": num}
                    LOG.debug(f"Docker service '{svc}' has wrong number "
                              f"of replicas: {failed[svc]}")

                not_checked = [x for x in not_checked if x != svc]

            if failed or not_checked:
                result = {"Docker services mismatch": failed,
                          "Not checked docker services": not_checked}
                LOG.warning(f"Compare docker services check failed: {result}")
                latest["status"] = result
                return False
            else:
                latest["status"] = ""
                LOG.info("Compare docker services check passed "
                         "(should not fail for 5 minutes)")
                return True

        def compare_twice(expected_services, excluded_services):
            """Ensure service statuses several times during 5 minutes
            (300 seconds, checked every 20 seconds)

            If compare() method is passed - then repeat it, until
            timeout reached.
            TimeoutError means that the compare() method was
            never failed during the specified timeout, so return True
            """
            try:
                # The waiter stops as soon as compare() fails,
                # so a normal exit means the services are not stable
                waiters.wait(lambda: not compare(
                    expected_services, excluded_services),
                    timeout=300, interval=20)
            except exceptions.TimeoutError:
                return True
            return False

        if not expected_services:
            docker_objects = self.cluster.expected_docker_objects
            expected_services = docker_objects["service"]
            if changed_after_upd:
                # proceed with a static copy of keys, to not replace keys twice
                expected_services_keys = list(expected_services.keys())
                for key in changed_after_upd:
                    if key in expected_services_keys:
                        # Move the expected value from the old service name to the new one
                        key_data = expected_services[key]
                        del expected_services[key]
                        expected_services[changed_after_upd[key]] = key_data

        # list of excluded docker services
        excluded_services = excluded_services or []
        LOG.info(f"Skip checks for the following docker services: "
                 f"{excluded_services}")

        try:
            # Repeat the stabilization window until it passes once or the timeout is reached
            waiters.wait(lambda: compare_twice(
                expected_services, excluded_services),
                timeout=timeout, interval=interval)
        except exceptions.TimeoutError:
            if latest['status']:
                err = (f"Timeout waiting for docker services. "
                       f"After {timeout}s there are some fails: "
                       f"{latest['status']}")
            else:
                err = "Timeout waiting for docker services"
            raise TimeoutError(err)
        LOG.info("All docker services and their replicas were found")

    def __get_replica_number(self, obj, namespace):
        """Return the desired number of pods declared by a pod owner.

        :param obj: owner reference in the "<Kind>/<name>" form
        :param namespace: namespace of the owner object
        :return: desired replicas for ReplicaSet (taken from the
            'deployment.kubernetes.io/desired-replicas' annotation),
            StatefulSet and DaemonSet owners; 0 for any other kind
        """
        owner_kind, owner_name = obj.split('/')

        if owner_kind == 'ReplicaSet':
            replicaset = self.k8sclient.replicasets.get(
                name=owner_name, namespace=namespace).read().to_dict()
            annotations = replicaset['metadata']['annotations']
            return annotations['deployment.kubernetes.io/desired-replicas']

        if owner_kind == 'StatefulSet':
            statefulset = self.k8sclient.api_apps.read_namespaced_stateful_set(
                name=owner_name, namespace=namespace).to_dict()
            return statefulset['spec']['replicas']

        if owner_kind == 'DaemonSet':
            daemonset = self.k8sclient.daemonsets.get(
                name=owner_name, namespace=namespace).read().to_dict()
            return daemonset['status']['desired_number_scheduled']

        # Owners of any other kind have no known replica count
        LOG.warning(f"Unknown kind {owner_kind}")
        return 0

    def check_actual_expected_replicas_count(self, replicas_list, expected_replicas_count=1):
        """Check that every replicaset reports the expected replicas count.

        :param replicas_list: list of replicaset objects
        :param expected_replicas_count: value expected in 'status.replicas'
        :return: True if all replicasets match, False otherwise
        """
        mismatched = 0
        for replicaset in replicas_list:
            # 'status' may be missing or empty in the object data
            actual_count = (replicaset.data.get('status') or {}).get('replicas', None)
            if actual_count != expected_replicas_count:
                mismatched += 1
                LOG.warning(f'Replicas count for replicaset {replicaset.name} not as expected. \n'
                            f'Current replicas {actual_count}, expected: {expected_replicas_count}')

        if mismatched:
            LOG.warning("Not all replicasets has expected replicas count")
            return False

        LOG.info("All replicasets has replicas count as expected")
        return True

    def check_deployments_ready(self, deployments_list):
        """Wait for each deployment in turn and report overall readiness.

        The check stops on the first deployment whose ``wait_ready()``
        returns a falsy value.

        :param deployments_list: iterable of deployment objects providing
            ``wait_ready()``
        :return: True if all deployments are ready, False otherwise
        """
        for deployment in deployments_list:
            if not deployment.wait_ready():
                LOG.error(f'Deployment {deployment} is not ready')
                return False
        # The original message had no space between the list and "have"
        LOG.info(f'All deployments in: {deployments_list} have status ready')
        return True

    def check_cluster_deleted(self, timeout=1800, interval=15):
        """Wait until the cluster object does not exist anymore.

        :param timeout: timeout passed to the waiter
        :param interval: polling interval passed to the waiter
        """
        def cluster_is_gone():
            return not self.cluster.is_existed()

        waiters.wait(cluster_is_gone,
                     timeout=timeout,
                     interval=interval,
                     timeout_msg='Timeout waiting for cluster deletion')
        LOG.info(f"Cluster {self.cluster.name} has been deleted")

    def _get_cluster_readiness(self,
                               exp_provider_status=True,
                               expected_fails=None,
                               expected_condition_fails=None):
        """Get conditions from Cluster and Machine objects

        :rtype bool: True only if all conditions are true, False in other case
        """
        machines_result = [m.are_conditions_ready(expected_condition_fails,
                                                  verbose=True)
                           for m in self.cluster.get_machines() if not m.is_disabled()]
        cluster_result = [
            self.cluster.are_conditions_ready(expected_condition_fails,
                                              verbose=True),
            self.cluster.is_ready(exp_provider_status, expected_fails)]

        return all(machines_result + cluster_result)

    def _get_cluster_status_msg(self):
        """To get latest status of the Cluster and Machine objects

        The message contains a YAML dump of the selected keys from the
        cluster 'providerStatus' and one line per machine which either has
        no conditions at all or has conditions that are not ready.

        A missing or empty 'status' / 'providerStatus' is tolerated for the
        cluster in the same way as for the machines, so that building the
        message does not fail while the status is not populated yet.

        :rtype str: human readable status message
        """
        status_keys = [
            'bastion',
            'conditions',
            'helm',
            'kind',
            'loadBalancerHost',
            'loadBalancerStatus',
            'nodes',
            'notReadyObjects',
            'ready',
        ]
        provider_status = (
            (self.cluster.data.get('status') or {}).get('providerStatus') or {})
        cluster_status = {k: v for k, v in provider_status.items()
                          if k in status_keys}

        machines = {
            m.name: (m.data.get('status') or {}).get('providerStatus') or {}
            for m in self.cluster.get_machines()}
        machines_msg = []
        for name, machine_status in machines.items():
            prefix = f"Machine {self.cluster.namespace}/{name}"
            if 'conditions' not in machine_status:
                machines_msg.append(
                    f"{prefix}: NO CONDITIONS IN PROVIDER STATUS")
                continue
            # Report only the conditions which are not ready
            errors = [f"{c['type']} condition: \"{c['message']}\""
                      for c in machine_status['conditions'] if not c['ready']]
            if errors:
                machines_msg.append(f"[ {prefix} ] " + "; ".join(errors))

        machines_str = "\n".join(machines_msg)
        return (f"Cluster data status:\n{yaml.dump(cluster_status)}\n"
                f"{machines_str}")

    def check_cluster_readiness(
            self,
            timeout=settings.CHECK_CLUSTER_READINESS_TIMEOUT,
            interval=60,
            exp_provider_status=True,
            expected_fails=None,
            expected_condition_fails=None):
        """
        Wait until the overall cluster is Ready
        and no unexpected deployments failed.

        If the first wait times out and 'Ceph' is the only entry in the
        collected 'not_ready' list, one more wait (600 seconds, checked
        every 60 seconds) is performed; otherwise the timeout is re-raised.

        :param timeout: timeout for waiter
        :type timeout: int
        :param interval: interval to check status
        :type interval: int
        :param exp_provider_status: expected overall cluster status
        :type exp_provider_status: bool
        :param expected_fails: list of strings with
        expectedly failed deployment names
        :type expected_fails: List[str]
        :param expected_condition_fails: dict with conditions like
            { <condition type>: <part of condition message to match> , }
        :type expected_condition_fails: Dict[str]

        :raises exceptions.TimeoutError: if the cluster did not become ready
        :return: None
        """
        LOG.info(f"Checking readiness for cluster '{self.cluster.namespace}/{self.cluster.name}'")
        expected_fails = expected_fails or []

        timeout_msg = "Timeout waiting for cluster readiness"

        def cluster_is_ready():
            return self._get_cluster_readiness(
                exp_provider_status,
                expected_fails,
                expected_condition_fails)

        def wait_for_readiness(wait_timeout, wait_interval):
            waiters.wait(
                cluster_is_ready,
                timeout=wait_timeout,
                interval=wait_interval,
                timeout_msg=timeout_msg,
                status_msg_function=self._get_cluster_status_msg)

        try:
            wait_for_readiness(timeout, interval)
        except exceptions.TimeoutError:
            not_ready = collect_cluster_readiness.readiness.get('not_ready', [])
            if not_ready != ['Ceph']:
                # Raise exception if not only Ceph
                raise
            # Only Ceph is not ready: give it one more fixed-length attempt
            wait_for_readiness(600, 60)

        LOG.info("Cluster checked")

    def check_diagnostic_cluster_status(self, timeout=600, interval=10, excluded_checks_names=None):
        """
        Create a diagnostic object for the cluster and verify its checks results

        The check is skipped (with a warning) when no diagnostic release is
        found or when settings.MOSK_CHILD_RUN_DIAGNOSTIC_CHECK is falsy.

        :param timeout: Timeout for waiting until the diagnostic object has 'finishedAt' set
        :param interval: interval for waiter
        :param excluded_checks_names: List of checks names to exclude from the final result check
        :return: None
        """

        # Diagnostic must be installed, otherwise there is nothing to check
        if len(self._cluster._manager.get_diagnosticreleases()) == 0:
            LOG.warning("No diagnostic release installed. Skipping check")
            return

        if not settings.MOSK_CHILD_RUN_DIAGNOSTIC_CHECK:
            LOG.warning(f"Skip diagnostic check "
                        f"as MOSK_CHILD_RUN_DIAGNOSTIC_CHECK "
                        f"is {settings.MOSK_CHILD_RUN_DIAGNOSTIC_CHECK}")
            return

        if excluded_checks_names:
            LOG.warning(f"Next checks are excluded and will not count for final result:\n"
                        f"{yaml.dump(excluded_checks_names)}")
        else:
            excluded_checks_names = []

        def is_finished(diagnostic_object):
            # 'status' may be missing or empty until the checks are finished
            current_status = diagnostic_object.data.get('status', {}) or {}
            return bool(current_status.get('finishedAt', 0))

        cluster_name = self.cluster.name
        cluster_namespace = self.cluster.namespace
        fail_statuses = ['ERROR', 'FAIL']
        LOG.info(f"Creating diagnostic object to check cluster {cluster_namespace}/{cluster_name}")
        test_obj_name = self.cluster.name + "-" + utils.gen_random_string(10)
        diagnostic_obj = self.cluster.create_diagnostic_object(object_name=test_obj_name)
        waiters.wait(lambda: is_finished(diagnostic_obj), timeout=timeout, interval=interval)

        checks_results = diagnostic_obj.data.get('status', {}).get('result', {})
        # Collect failed checks, ignoring the explicitly excluded ones
        failed_checks = {}
        for check_name, check_result in checks_results.items():
            if check_result.get('result', '') not in fail_statuses:
                continue
            if check_name in excluded_checks_names:
                continue
            failed_checks[check_name] = check_result

        assert not failed_checks, f"Next cluster checks are failed:\n{yaml.dump(failed_checks)}"
        LOG.info(f"Cluster {cluster_name} in namespace {cluster_namespace} checked. "
                 f"Diagnostic checks result:\n{yaml.dump(checks_results)}")

    def check_cluster_status(self, exp_cluster_status="Ready", timeout=600, interval=60):
        """
        Wait until self.cluster.cluster_status equals the expected value

        :param exp_cluster_status: expected status ('Pending', 'Updating', 'Ready')
        :param timeout: timeout for waiter
        :param interval: interval for checks
        """
        def _status_matches():
            return self.cluster.cluster_status == exp_cluster_status

        waiters.wait(
            _status_matches,
            timeout=timeout,
            interval=interval,
            timeout_msg=f"Timeout waiting for cluster status is {exp_cluster_status}",
        )

    def check_ntp_status(self, previous_reference=None):
        """
        Check that NTP on all cluster machines is in sync and uses the same Reference ID

        The Reference ID of the first machine returned by get_machines()
        is used as the source value for all other machines.
        The optional previous_reference param extends the check with a
        verification that the NTP reference has changed.

        Will fail in case NTP servers are not set (default pools used), because
        it is unpredictable which server will be chosen for each node by a pool.

        :param previous_reference: dict with information about previous reference IDs of NTP on nodes
               Example:
               {
                 'node-0': '450ADF83 (69-10-223-131.rainierconnect.com)',
                 'node-1': '3351D1E8 (vps-4e90522b.vps.ovh.us)',
                 'node-2': '4559CF63 (ntp1.wiktel.com)'
               }
        :return: None
        """
        LOG.info(f"Checking NTP status for cluster {self.cluster.namespace}/{self.cluster.name}")
        ref_id = ''
        for m in self.cluster.get_machines():
            machine_ref = f"{m.namespace}/{m.name}"
            LOG.info(f"Checking NTP on machine {machine_ref}")
            chr_status = m.get_chrony_status()
            current_ref_id = chr_status['Reference ID']

            if ref_id:
                # Every next machine must have the same Reference ID as the first one
                assert current_ref_id == ref_id, (
                    f"Chrony Reference ID has unexpected value on machine {machine_ref}. "
                    f"Expected: {ref_id};Actual: {chr_status['Reference ID']}")
            else:
                # First machine in the list defines the source Reference ID
                ref_id = current_ref_id
                LOG.info(f"Using reference ID '{ref_id}' as a source")

            if previous_reference:
                # Verify that NTP changed
                assert current_ref_id != previous_reference[m.name], \
                    f"Machine {machine_ref} Chrony Reference ID does not changed"

            # Verify that NTP in sync
            leap_status = chr_status['Leap status']
            assert leap_status == 'Normal', (
                f"NTP sync status on machine {machine_ref} is not 'Normal'. "
                f"Status: {leap_status}")
            LOG.info(f"Machine {machine_ref} checked")
        LOG.info('NTP check finished successfully')

    def check_no_unexpected_lcm_phases_reexecuted(self, prev_timestamps, allowed_phases=None):
        """
        Check that 'finishedAt' timestamps of LCM phases did not change

        Phases listed in allowed_phases are skipped, as they are expected
        to be executed.

        :param prev_timestamps: Output of cluster.get_cluster_lcmmachines_timestamps() executed before
        :param allowed_phases: List of phases that are expected to be executed (their timestamps may change)
        """
        allowed_phases = [] if allowed_phases is None else allowed_phases
        LOG.info(f"Starting lcm phases check. Expected phases allowed to run: {allowed_phases}")
        timestamps = self.cluster.get_cluster_lcmmachines_timestamps()
        for m in self.cluster.get_machines():
            LOG.info(f"Checking machine {m.namespace}/{m.name}")
            current_phases = timestamps[m.name]['phases']
            for phase in current_phases:
                if phase in allowed_phases:
                    continue
                LOG.info(f"Checking phase '{phase}'")
                # Completion time must be the same as in the previous snapshot
                finished_now = current_phases[phase]['finishedAt']
                finished_before = prev_timestamps[m.name]['phases'][phase]['finishedAt']
                assert finished_now == finished_before, \
                    f"{phase} was unexpectedly executed at machine {m.name}"
            LOG.info(f"Machine {m.namespace}/{m.name} checked")
        LOG.info('Phase check finished')

    def check_offline_isolation(self, proxy_access_str):
        """
        Check that machines cannot reach external resources directly

        One machine per machine type is checked (the last machine of each
        type returned by get_machines()). For every checked machine:
          * negative check: curl without proxy variables to each resource
            must fail with one of the known connection error messages;
          * positive check (only if proxy_access_str is set): curl to the
            first resource through the proxy must succeed.

        If settings.KAAS_SSL_PROXY_CERTIFICATE_FILE is set, the CA
        certificate from the cluster proxy object is passed to curl
        in the positive check.

        :param proxy_access_str: proxy address for 'curl -x', or empty value
            to skip the positive check
        :return: None
        """
        resources_to_check = [
            # Binaries
            "https://binary.mirantis.com",
            # Docker images
            "http://mirantis.azurecr.io",
            # APT mirror
            "https://mirror.mirantis.com",
            # Docker Engine mirror
            "https://repos.mirantis.com",
        ]
        # Messages in the curl output which confirm that the resource is unreachable
        isolation_markers = (
            "Connection timed out",
            "Failed connect to",
            "Failed to connect to",
            "Could not resolve host",
            "Resolving timed out",
            "Connection timeout",
        )
        # * "hostname -s" is used because equinix labs has wrong name
        #   order in /etc/hosts: "hostname -f" returns "localhost"
        direct_curl_prefix = (
            "set -x; hostname -s;"
            "unset HTTP_PROXY; unset HTTPS_PROXY;"
            "unset http_proxy; unset https_proxy;"
            "curl --connect-timeout 10 -sS "
        )

        cert_cmd = ""
        certificate_param = ""
        if settings.KAAS_SSL_PROXY_CERTIFICATE_FILE:
            proxy_name = self.cluster.spec['providerSpec']['value']['proxy']
            # For non-management clusters the proxy object is read via the parent cluster client
            if self.cluster.is_management:
                proxy_owner = self.cluster
            else:
                proxy_owner = self.cluster.get_parent_cluster()
            proxy_obj = proxy_owner.k8sclient.kaas_proxies.get(proxy_name, self.cluster.namespace)
            proxy_cert_b64 = proxy_obj.data['spec']['caCertificate']
            cert_cmd = f'echo {proxy_cert_b64}|base64 -d > custom_proxy_cert;'
            certificate_param = "--cacert custom_proxy_cert --insecure"

        # Keep a single machine per machine type
        machines_to_check = {}
        for candidate in self.cluster.get_machines():
            machines_to_check[candidate.machine_type] = candidate

        for machine_type, machine in machines_to_check.items():
            machine_ref = f"{self._cluster.name}/{machine.name}"
            LOG.info(f"Offline isolation check on machine [{machine_type}] "
                     f"{machine_ref}")

            # Negative check: direct access from the offline node
            # to external resources should fail.
            for resource in resources_to_check:
                result = machine.exec_pod_cmd(direct_curl_prefix + resource, verbose=False)
                assert result['exit_code'] != 0, (
                    f"No offline isolation on the machine {machine_ref}:"
                    f"\n{result['logs']}")
                assert any(marker in result['logs'] for marker in isolation_markers), (
                    f"Unexpected result while checking offline isolation "
                    f"on the machine {machine_ref}:"
                    f"\n{result['logs']}")

            if not proxy_access_str:
                LOG.warning("Skip external access check through proxy "
                            "because proxy setting is empty")
            else:
                # Positive check: access from the offline node to
                # an external resource using the specified proxy should pass
                positive_cmd = "".join([
                    "set -x; hostname -s;",
                    cert_cmd,
                    f"curl --connect-timeout 10 --retry 3 --retry-delay 10 -sS -x '{proxy_access_str}' ",
                    f"{resources_to_check[0]} ",
                    certificate_param,
                ])
                result = machine.exec_pod_cmd(positive_cmd, verbose=False)
                assert result['exit_code'] == 0, (
                    f"No proxy access from the machine {machine_ref}:"
                    f"\n{result['logs']}")

            LOG.info(f"Offline isolation check on machine [{machine_type}] "
                     f"{machine_ref}: PASSED")

    @staticmethod
    @retry((AssertionError, TimeoutError), delay=60, tries=10, logger=LOG)
    def check_cert_conversation(host, port=443):
        """
        Perform a TLS handshake with host:port and verify the certificate CN

        The peer certificate subject is decoded into a dict and its "CN"
        component must be equal to the host. The check is retried on
        AssertionError and TimeoutError.

        :param host: host name to connect to; also used as the SNI name
            and as the expected common name
        :param port: TCP port to connect to
        :return: None
        """
        # Fall back to TLS 1.2 if the SSLv23 method is rejected with ValueError
        try:
            tls_context = Context(SSLv23_METHOD)
        except ValueError:
            tls_context = Context(TLSv1_2_METHOD)
        tls_context.set_timeout(5)

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.connect((host, port))
            tls_connection = Connection(tls_context, sock)
            tls_connection.set_tlsext_host_name(host.encode())
            tls_connection.set_connect_state()
            tls_connection.do_handshake()

            cert = tls_connection.get_peer_certificate()
            LOG.info(f'Certificate: {cert}')
            subject_list = cert.get_subject().get_components()
            LOG.info(f'Subject list: {subject_list}')
            # Subject components come as pairs of bytes: (name, value)
            cert_byte_arr_decoded = {
                name.decode('utf-8'): value.decode('utf-8')
                for name, value in subject_list
            }
            LOG.info(f"Decoded cert byte array: {cert_byte_arr_decoded}")

            assert len(cert_byte_arr_decoded) > 0
            assert cert_byte_arr_decoded["CN"] == host, \
                'Cert common name is not equal to expected one'

    def check_cert_equal(self, port=443, cert=None):
        """
        Check that all control machines serve the same certificate on the port

        The certificate is read on each machine with 'openssl s_client'
        against 127.0.0.1. The last character of the command output is
        dropped before comparison.

        :param port: local port to read the certificate from
        :param cert: expected certificate; if not set, the certificate from
            the first control machine is used as the expected one
        :return: None
        """
        LOG.info("Check cert equal")
        cmd = f"openssl s_client -connect 127.0.0.1:{port} 2>/dev/null | openssl x509"
        control_machines = self.cluster.get_machines(machine_type='control')
        if not cert:
            cert = control_machines[0].exec_pod_cmd(cmd)['logs'][:-1]

        mismatched_nodes = []
        for m in control_machines:
            LOG.debug(f"{m.name}  - cert: ")
            served_cert = m.exec_pod_cmd(cmd)['logs'][:-1]
            if served_cert != cert:
                mismatched_nodes.append(m.name)
        assert not mismatched_nodes, f"Nodes with invalid cert: {mismatched_nodes}"

    @retry(AssertionError, delay=30, tries=10, logger=LOG)
    def validate_machines_mcc_ca(self, ca):
        """
        Check that /usr/local/share/mcc/mcc.crt on every machine contains the CA

        Retried on AssertionError.

        :param ca: CA certificate text expected to be a part of mcc.crt
        :return: None
        """
        read_ca_cmd = "sudo cat /usr/local/share/mcc/mcc.crt"
        for machine in self.cluster.get_machines():
            ca_file_content = machine.exec_pod_cmd(read_ca_cmd)['logs']
            assert ca in ca_file_content, f"Node with invalid mcc.crt: {machine.name}"

    @retry(AssertionError, delay=30, tries=10, logger=LOG)
    def validate_agent_kubeconfig(self, ca):
        """
        Check that the agent kubeconfig on every machine contains the custom CA

        The kubeconfig path is taken from the KUBECONFIG entry of
        /etc/lcm/environment on the machine, then the base64-decoded
        'certificate-authority-data' of that file is compared with the CA.
        The check is skipped for the BYO provider and is retried
        on AssertionError.

        :param ca: CA certificate text expected to be a part of the kubeconfig CA data
        :return: None
        """
        if self.cluster.provider == utils.Provider.byo:
            LOG.info(f"Provider '{self.cluster.provider}' doesn't run agents on nodes. "
                     f"Skipping agent checks")
            return
        LOG.info(f"Checking agent kubeconfigs in cluster {self.cluster.name} contain custom CA")
        machines = self.cluster.get_machines()
        kubeconfig_path_cmd = "sudo cat /etc/lcm/environment | grep KUBECONFIG | cut -f2 -d ="

        for m in machines:
            # The path is interpolated into the middle of a shell pipeline below,
            # so surrounding whitespace (e.g. a trailing newline of the command
            # output) must be removed, otherwise the pipeline gets split
            kubeconfig_path = m.exec_pod_cmd(kubeconfig_path_cmd)['logs'].strip()
            assert kubeconfig_path, f"KUBECONFIG path is not found on node: {m.name}"
            ca_cmd = f"sudo cat {kubeconfig_path} | grep certificate-authority-data " \
                     f"| cut -f2 -d : | tr -d ' ' | base64 -d"
            kubeconfig_ca = m.exec_pod_cmd(ca_cmd)['logs']
            assert kubeconfig_ca, f"Empty CA data in kubeconfig on node: {m.name}"
            assert ca in kubeconfig_ca, f"Node with invalid kubeconfig: {m.name}"

    @retry(AssertionError, delay=30, tries=10, logger=LOG)
    def validate_helm_controller_secret(self, ca):
        """
        Check that ca.crt of 'helm-controller-secret' contains the custom CA

        The secret is read from the 'kube-system' namespace and its
        'ca.crt' data is base64-decoded before the comparison.
        Retried on AssertionError.

        :param ca: CA certificate text expected to be a part of the bundle
        :return: None
        """
        LOG.info("Checking helm-controller's ca.crt contains custom CA")
        secret = self.cluster.k8sclient.secrets.get(
            name="helm-controller-secret", namespace="kube-system")
        secret_data = secret.read().to_dict()['data']
        decoded_bundle = base64.b64decode(secret_data['ca.crt']).decode("utf-8")
        assert ca in decoded_bundle, \
            f"Helm controller for cluster {self.cluster.name} does not contain a valid ca"

    def check_actual_expected_ceph_pods(self, scale_machines_names_list=None,
                                        timeout=1200, interval=120):
        """
        Wait until the number of Ceph OSD and MON pods matches the Ceph spec

        Expected numbers are calculated from the Ceph cluster spec nodes
        (miracephcluster or kaascephcluster, depending on the
        skip_kaascephcluster_usage() workaround): one OSD pod per storage
        device and one MON pod per node with the 'mon' role. Actual numbers
        are counted by 'rook-ceph-osd-' / 'rook-ceph-mon-' pods in the
        'rook-ceph' namespace.

        If scale machines are passed, the method first waits until those of
        them which are used for Ceph appear in the Ceph cluster spec.

        :param scale_machines_names_list: list of names of scaled machines (optional)
        :param timeout: timeout for each waiter
        :param interval: interval for each waiter
        :return: True if the provider doesn't support Ceph (checks are
            skipped), None otherwise
        :raises Exception: on equinix providers, if the number of non-boot
            disks of a scaled machine differs from the number of its
            storage devices in kaascephcluster
        """
        machines_to_count = []
        if self.cluster.provider not in utils.Provider.with_ceph():
            LOG.info(f"Provider '{self.cluster.provider}' doesn't support Ceph. "
                     f"Skipping Ceph pods checks")
            return True
        elif scale_machines_names_list and self.cluster.provider == utils.Provider.equinixmetal:
            # Check if node is used for ceph
            # If Ceph configuration is not manual, then all machines are used
            # for Ceph. Else, need to check this
            cluster_ceph_spec = self.cluster.data.get('spec', {}).get(
                'providerSpec', {}).get('value', {}).get('ceph', {})
            if cluster_ceph_spec.get('manualConfiguration', False):
                for machine in scale_machines_names_list:
                    machine_obj = self.cluster.get_machine(machine)
                    storage = machine_obj.data.get(
                        'spec').get('providerSpec').get(
                        'value').get('ceph', {})
                    if storage.get('managerMonitor', False) \
                            or storage.get('storage', False):
                        machines_to_count.append(machine)
                    else:
                        LOG.info(f"Machine {machine} was not used for Ceph. "
                                 f"Skipping")
            else:
                LOG.info(f"All of scale machines {scale_machines_names_list} "
                         f"are used for Ceph, because "
                         f"Ceph configuration is not manual")
                machines_to_count.extend(scale_machines_names_list)
        elif scale_machines_names_list and self.cluster.provider == utils.Provider.baremetal:
            # Need to check is scale node is used for Ceph or not
            # For BM Provider this is done by labeling node as storage
            for machine in scale_machines_names_list:
                machine_obj = self.cluster.get_machine(machine)
                # 'labels' may be absent in metadata, treat it as no labels
                labels = machine_obj.data.get('metadata', {}).get('labels') or {}
                if 'storage' in labels:
                    machines_to_count.append(machine)

        def check_machines_in_cephcluster():
            """Return True when all machines_to_count are present in the Ceph cluster spec"""
            is_machine_added = True
            if self.cluster.workaround.skip_kaascephcluster_usage():
                ceph_machines = [node['name'] for node in
                                 self.cluster.get_miracephcluster().data.get(
                                     'spec', {}).get('nodes', [])]

                for machine in machines_to_count:
                    # machines_to_count holds machine names, so the machine
                    # object has to be fetched to read its instance name.
                    # NOTE(review): assumes 'instanceName' is stored under
                    # the machine 'status' - confirm
                    machine_status = self.cluster.get_machine(
                        machine).data.get('status') or {}
                    if machine_status.get('instanceName') in ceph_machines:
                        LOG.info(f"Machine {machine} was successfully "
                                 f"added to kaascephcluster")
                    else:
                        LOG.info(f"Machine {machine} is not existed "
                                 f"in kaascephcluster yet")
                        is_machine_added = False
            else:
                ceph_machines =\
                    self.cluster.get_kaascephcluster().data.get(
                        'spec', {}).get('cephClusterSpec', {}).get(
                        'nodes', {}).keys()

                for machine in machines_to_count:
                    if machine in ceph_machines:
                        LOG.info(f"Machine {machine} was successfully "
                                 f"added to kaascephcluster")
                    else:
                        LOG.info(f"Machine {machine} is not existed "
                                 f"in kaascephcluster yet")
                        is_machine_added = False

            return is_machine_added

        if machines_to_count:
            # Wait for machines are added to kaascephcluster
            waiters.wait(
                check_machines_in_cephcluster,
                timeout=timeout,
                interval=interval)

        def check_ceph_pods_number():
            """Return True when actual OSD and MON pods numbers are equal to expected"""
            expected_osd_pod_num = 0
            expected_mon_pod_num = 0
            if machines_to_count:
                # For equinix provider all of disks (except boot) are used
                # for Ceph if node is used for Ceph. So we need to check
                # this condition too. Check that free disks number is equal
                # storage devices in kaascephcluster after adding new node
                if self.cluster.provider in [utils.Provider.equinixmetal, utils.Provider.equinixmetalv2]:
                    disks_missmatch = {}
                    for machine in machines_to_count:
                        # Get free disks
                        machine_data = self.cluster.get_machine(
                            machine).data
                        free_disks = [d for d in machine_data.get(
                            'status', {}).get('providerStatus', {}).get(
                            'hardware', {}).get('storage', {}) if not d.get(
                            'isBoot', False)]
                        # Get Ceph disks from kaascephcluster
                        kaascephcluster_storage =\
                            self.cluster.get_kaascephcluster().data.get(
                                'spec', {}).get('cephClusterSpec', {}).get(
                                'nodes', {}).get(machine, {}).get(
                                'storageDevices', [])
                        # Check it matches
                        if len(free_disks) != len(kaascephcluster_storage):
                            disks_missmatch[machine] = {
                                'node_disks': free_disks,
                                'ceph_disks': kaascephcluster_storage}
                    if disks_missmatch:
                        raise Exception(
                            f"Disks and osds count missmatch "
                            f"for next nodes: {disks_missmatch.keys()}."
                            f"\n{yaml.dump(disks_missmatch)}")

            # Collect (devices, roles) for every node from the Ceph spec.
            # miracephcluster keeps nodes as a list with 'devices', while
            # kaascephcluster keeps them as a dict with 'storageDevices'
            if self.cluster.workaround.skip_kaascephcluster_usage():
                ceph_nodes = self.cluster.get_miracephcluster().data.get(
                    'spec', {}).get('nodes', [])
                nodes_specs = [(node.get('devices', []), node.get('roles', []))
                               for node in ceph_nodes]
            else:
                ceph_nodes = self.cluster.get_kaascephcluster().data.get(
                    'spec', {}).get('cephClusterSpec', {}).get('nodes', {})
                nodes_specs = [(node.get('storageDevices', []), node.get('roles', []))
                               for node in ceph_nodes.values()]

            for devices, roles in nodes_specs:
                expected_osd_pod_num += len(devices)
                if 'mon' in roles:
                    expected_mon_pod_num += 1

            actual_osd_pods = [
                pod for pod in self.cluster.k8sclient.pods.list_starts_with(
                    pattern='rook-ceph-osd-', namespace='rook-ceph') if
                'osd-prepare' not in pod.name]
            actual_mon_pods = [
                pod for pod in self.cluster.k8sclient.pods.list_starts_with(
                    pattern='rook-ceph-mon-', namespace='rook-ceph')]
            actual_osd_pods_num = len(actual_osd_pods)
            actual_mon_pods_num = len(actual_mon_pods)

            missmatch = {}
            mssg = "Missmatch between expected and actual Ceph pods number."
            if actual_osd_pods_num != expected_osd_pod_num:
                mssg += (f" Current osds count {actual_osd_pods_num}, "
                         f"but should be {expected_osd_pod_num}.")
                missmatch['osd'] = actual_osd_pods_num
            # MON pods are verified independently of the OSD result, so a
            # MON-only mismatch is not reported as success
            if actual_mon_pods_num != expected_mon_pod_num:
                mssg += (f" Current mons count {actual_mon_pods_num}, "
                         f"but should be {expected_mon_pod_num}.")
                missmatch['mon'] = actual_mon_pods_num

            if missmatch:
                LOG.error(mssg)
                return False
            LOG.info("Current osds and mons pods number as expected")
            return True

        waiters.wait(check_ceph_pods_number, timeout=timeout,
                     interval=interval)

    def check_pod_container_expected_resources(
            self, pod_name, pod_ns, pod_container_name=None,
            expected_resources=None):
        """
        Compare the resources of a pod container with the expected ones.

        expected_resources should be in the following format:
          limits:
            cpu: "3"
            memory: 2Gi
          requests:
            cpu: "2"
            memory: 1Gi

        Use None if you expect no resources. For example:
          limits: None
          requests: None

        If pod_container_name is not set, the pod must have exactly one
        container, which is then used for the check. If expected_resources
        is not set, the check is skipped and True is returned.

        :return: True if limits and requests are as expected (or the check
            is skipped), False otherwise
        """

        pod = self.cluster.k8sclient.pods.get(
            name=pod_name, namespace=pod_ns)
        pod_name = pod.name
        if not pod_container_name:
            assert len(pod.containers) == 1, (
                f"Container name is not set and there are multiple containers "
                f"in pod {pod_name}. Please, choose one of "
                f"{[c['name'] for c in pod.containers]}")
            pod_container_name = pod.containers[0]['name']

        if not expected_resources:
            LOG.info(f"Expected resources are not set for pod {pod_name}. "
                     f"Skipping checks. "
                     f"Current resources: "
                     f"\n{yaml.dump(pod.containers_resources)}")
            return True

        resources = pod.containers_resources
        expected_limits = expected_resources.get('limits', {})
        expected_requests = expected_resources.get('requests', {})
        matching_containers = list(filter(
            lambda c: c['name'] == pod_container_name, resources['containers']))
        assert matching_containers, (f"Container {pod_container_name} not found "
                                     f"for pod {pod_name}")
        actual = matching_containers[0].get('resources')
        limits_ok = expected_limits == actual.get('limits')
        requests_ok = expected_requests == actual.get('requests')

        if limits_ok and requests_ok:
            LOG.info(f"All resources for container {pod_container_name} "
                     f"in pod {pod_name} as expected")
            return True

        # Pick the error message for the part which does not match
        if limits_ok:
            problem = (f"Limits as expected, but requests are not as "
                       f"expected for container {pod_container_name} "
                       f"in pod {pod_name}")
        elif requests_ok:
            problem = (f"Limits are not as expected for container "
                       f"{pod_container_name} in pod {pod_name}")
        else:
            problem = (f"Limits and requests are not as expected "
                       f"for container {pod_container_name} in pod "
                       f"{pod_name}")
        LOG.error(problem)
        LOG.debug(f"\nExpected resources: "
                  f"\n{yaml.dump(expected_resources)} "
                  f"\nCurrent resources: "
                  f"\n{yaml.dump(resources)}")
        return False

    def compare_supported_and_existed_clusterreleases(self):
        """
        Check that every supported cluster release has a clusterrelease object

        Names of supported cluster releases for the cluster provider are
        compared with names of all existing clusterrelease objects.
        Does nothing (returns None) for non-management clusters.

        :raises Exception: if some supported releases have no
            clusterrelease objects
        """
        if not self.cluster.is_management:
            LOG.info("KaasRelease and ClusterRelease objects available "
                     "only for mgmt cluster")
            return None

        existing_names = [
            release_obj.name for release_obj in
            self.cluster.k8sclient.kaas_clusterreleases.list_all()]
        LOG.debug(f"Clusterreleases objects names: "
                  f"{existing_names}")

        supported = self.cluster._manager.get_supported_clusterreleases(
            self.cluster.provider.provider_name)
        supported_names = [
            release.get('name', '') for _, release in supported.items()]
        LOG.debug(f"Supported clusterreleases names: "
                  f"{supported_names}")

        # Supported releases which have no clusterrelease object
        lost_releases = [name for name in supported_names
                         if name not in existing_names]
        if lost_releases:
            raise Exception(f"Next releases are announced as Supported, but "
                            f"clusterrelease objects for these releases are "
                            f"not existed: {lost_releases}")

    def check_actual_expected_rolebindings(self, timeout=300, interval=10):
        """
        Wait until actual rolebindings match the expected ones

        Expected rolebindings are taken from
        self.cluster.get_expected_rolebindings() as a dict
        { <namespace>: { <rolebinding name prefix>: "<kind>/<role name>" } }.
        Actual rolebindings from the expected namespaces are compared with
        them; rolebindings whose names start with any of
        self.EXCLUDED_ROLESBINDS are ignored. The check fails if an expected
        rolebinding is missing or refers to a different role, or if some
        actual rolebinding is left unmatched.

        :param timeout: timeout for waiter
        :param interval: interval for waiter
        :raises TimeoutError: if rolebindings still mismatch after the timeout
        :return: None
        """
        # Check is possible on cluster with mke-7-2-0 and larger
        cluster_release = self.cluster.clusterrelease_version
        LOG.info(f"Cluster release: {cluster_release}")

        def compare(expected_rolebinds_dict: dict):
            """Return a dict with found problems, or None if everything matches"""
            failed = {}
            namespaces = expected_rolebinds_dict.keys()
            excluded_prefixes = tuple(self.EXCLUDED_ROLESBINDS)

            # Keep only rolebindings from expected namespaces which are not excluded
            actual_list = [
                rolebind for rolebind in
                self.k8sclient.rolebindings.list_raw().to_dict()['items']
                if not rolebind['metadata']['name'].startswith(excluded_prefixes)
                and rolebind['metadata']['namespace'] in namespaces]

            not_checked = [(rolebind['metadata']['name'], rolebind['metadata']['namespace'])
                           for rolebind in actual_list]

            for ns in namespaces:
                # Reverse order, so that a longer name is matched before its own prefix
                expected_rolebinds_lst = sorted(
                    list(expected_rolebinds_dict[ns].keys()), reverse=True
                )
                for rolebind in expected_rolebinds_lst:
                    roleref = expected_rolebinds_dict[ns][rolebind]
                    compare_rolebind = next((rb for rb in actual_list if
                                             rb['metadata']['name'].startswith(rolebind) and
                                             rb['metadata']['namespace'] == ns and
                                             (rb['metadata']['name'], rb['metadata']['namespace']) in not_checked),
                                            None)

                    if not compare_rolebind:
                        failed[rolebind] = {"actual": None,
                                            "desired/expected": roleref,
                                            "namespace": ns}
                        continue

                    compare_rolebind_roleref = compare_rolebind['role_ref']
                    compare_rolebind_roleref = (f"{compare_rolebind_roleref['kind']}/"
                                                f"{compare_rolebind_roleref['name']}")
                    if roleref != compare_rolebind_roleref:
                        failed[rolebind] = {"actual": compare_rolebind_roleref,
                                            "desired/expected": roleref,
                                            "namespace": ns}

                    # Mark the matched actual rolebinding as checked. Its real
                    # name is used here, because the match above is done by
                    # prefix and the real name may be longer than the expected one
                    checked_key = (compare_rolebind['metadata']['name'], ns)
                    not_checked = [x for x in not_checked if x != checked_key]

            if failed or not_checked:
                result = {"RoleBind mismatch": failed, "Not checked rolebinds": not_checked}
                LOG.warning(f"Compare rolebindings check failed: {result}")
                return result

        expected_rolebindings = self.cluster.get_expected_rolebindings()

        try:
            waiters.wait(lambda: not compare(
                expected_rolebinds_dict=expected_rolebindings),
                timeout=timeout, interval=interval)
        except exceptions.TimeoutError:
            # One more attempt after the timeout to collect the final result
            result = compare(expected_rolebinds_dict=expected_rolebindings)
            if result:
                err = f"Timeout waiting for rolebindings. After {timeout}s there are some fails: {result}"
                raise TimeoutError(err)
        LOG.info("All expected rolebindings are placed")

    def get_machine_repository_url(self, machine):
        """
        Read the 'deb' entries of /etc/apt/sources.list from the machine
        Args:
            machine: machine object, must provide exec_pod_cmd()

        Returns: 'logs' field of the executed command result

        """
        cmd_result = machine.exec_pod_cmd("grep deb /etc/apt/sources.list", verbose=False)
        return cmd_result['logs']

    def check_repository_url(self):
        """
        Verify that repository urls on every enabled machine end with the url
        expected for the machine type ('control' or 'worker')

        Returns: None

        Raises: Exception with the names of the machines which have unexpected repository url

        """
        def _check_url(machines, machine_type):
            """
            Collect machines whose repository urls don't end with the url expected for the machine type.
            Args:
                machines: list of machine objects
                machine_type: machine type, used as a key in the cluster repository url mapping

            Returns: list of names of the machines that don't match

            """
            mismatched = []

            for mchn in machines:
                LOG.info(f"Get repository url for {machine_type} machine {mchn.name}")
                if mchn.is_disabled():
                    LOG.info(f"Machine {mchn.name} is disabled. Skipping")
                    continue
                sources = self.get_machine_repository_url(mchn)
                urls = re.findall(r"https://[^\s]*", sources)
                # the last url relates to kubernetes-extra repo, so it is excluded from the check
                if any(not url.endswith(repo_url[machine_type]) for url in urls[:-1]):
                    mismatched.append(mchn.name)
                    LOG.info(
                        f"Expected repository url is '{repo_url[machine_type]}' "
                        f"but machine {mchn.name} has:\n {sources}")
            return mismatched

        repo_url = self.cluster.get_cluster_repository_url()
        # fetch both machine lists before running any check
        machines_by_type = {m_type: self.cluster.get_machines(machine_type=m_type)
                            for m_type in ('control', 'worker')}

        errors = []
        for m_type, type_machines in machines_by_type.items():
            errors.extend(_check_url(type_machines, m_type))

        if errors:
            raise Exception(f"Next machine has incorrect repository url: {', '.join(errors)}")

    def expect_kernel_changed_between_versions(self, cr_previous=None):
        """Tell whether expected kernel versions differ between the previous and the current
        clusterreleases, for case, where we dont have allowedDistributions.

        Entries which contain 'notgreenfield' are dropped before the comparison.

        Args:
            cr_previous: previous clusterrelease, passed as 'target_cr' to get_expected_kernel_version()

        Returns: True if kernel change is expected for at least one key, otherwise False
        """
        def _without_notgreenfield(versions):
            return [item for item in versions if 'notgreenfield' not in item]

        previous_map = self.cluster.get_expected_kernel_version(target_cr=cr_previous)
        current_map = self.cluster.get_expected_kernel_version()

        kernel_changed = False
        for key, current_versions in current_map.items():
            previous_versions = previous_map.get(key)
            if previous_versions is None:
                # nothing to compare with in the previous release
                continue
            if previous_versions != current_versions:
                previous_formatted = _without_notgreenfield(previous_versions)
                current_formatted = _without_notgreenfield(current_versions)
                if previous_formatted != current_formatted:
                    LOG.debug(f"Expected version current: {current_formatted}")
                    LOG.info(f"Expected version previous {previous_formatted}")
                    kernel_changed = True
                    LOG.info(f"Kernels should be changed between upgrade "
                             f"({cr_previous} -> {self.cluster.clusterrelease_version}) "
                             f"from {previous_formatted} "
                             f"to {current_formatted} for {key}")

        return kernel_changed

    def check_actual_expected_kernel_versions(self, is_postpone_distribution_upgrade_enabled=False):
        """
        Compare expected kernel, os_version and tarfs (dib) version with actual, per machine

        Expected values are taken from the 'allowedDistributions' section of the test data returned by
        cluster.get_expected_kernel_version(). Actual values are taken from
        cluster.get_nodes_kernel_and_tarfs_versions() and from LCMMachine status.hostInfo.
        Disabled machines are skipped. The whole check is skipped when
        settings.SKIP_EXPECTED_KERNEL_VERSIONS_CHECK is set.

        If the cluster status has a reference to the previous release (cluster was upgraded), kernel and
        os_version are only required to be among the allowed ones, and the dib version is not checked.
        Otherwise kernel, os_version and dib version must be equal to the single non-'notgreenfield'
        test data entry with the id of the Machine distribution.

        TODO(alexz): part with "allowedDistributions" still on WIP phase, and upcoming changes will be
        integrated later. New logic will not affect old one style, due hide under FF.

        Args:
            is_postpone_distribution_upgrade_enabled: when False, the allowed distributions are narrowed to
                the one set in the Machine spec; when True, all distributions from the test data are used.
                Has an effect for upgraded clusters only, since the initial deployment branch always
                filters the test data by the Machine distribution id.

        Returns: None

        Raises:
            AssertionError: if test data or clusterrelease data are inconsistent or actual versions
                don't match the expected ones
            Exception: if 'distribution' is not set in the Machine providerSpec
        """
        if settings.SKIP_EXPECTED_KERNEL_VERSIONS_CHECK:
            LOG.warning('check_actual_expected_kernel_versions: skipped')
            return
        expected_versions_map = self.cluster.get_expected_kernel_version()
        # SI-tests now will be able to work only with 2.16+ releases
        assert "allowedDistributions" in expected_versions_map.keys(), \
            "allowedDistributions not found in expected_versions_map from test data."

        LOG.warning("'allowedDistributions' extra mapping detected, per-machine logic will be used")
        test_allowed_dists = expected_versions_map['allowedDistributions']
        # allowedDistributions from the ClusterRelease object are used only for the sanity check
        # (exactly one default distribution) and for logging
        kaas_release_name = self.cluster.clusterrelease_version
        kaas_release_allowed_distr = self._cluster._manager.get_clusterrelease(name=kaas_release_name
                                                                               ).data['spec']['allowedDistributions']
        kaas_release_default_dist = [d for d in kaas_release_allowed_distr if d.get('default')]
        assert len(kaas_release_default_dist) == 1, \
            f"Expected to have only one default distrib in release!ClusterRelease-map:" \
            f"\n{kaas_release_allowed_distr}\n is broken!"

        LOG.info(f"ClusterRelease:{kaas_release_name} have allowedDistributions:\n{kaas_release_allowed_distr}")
        # actual data, indexed below by machine name with 'kernel', 'os_version' and 'dib_datetime' keys
        perhostdata = self.cluster.get_nodes_kernel_and_tarfs_versions()
        # TODO(alexz): previous information required to determine initial tarfs. Currently its not possible to do.
        # corresponding changes will be integrated into lcm-agent(?) and test will be refactored accordingly.
        # Dummy check, is cluster been upgraded
        is_upgraded = False
        # non-empty name of the previous release in the cluster status is treated as "cluster was upgraded"
        previous_clusterrelease = self.cluster.data['status'].get('providerStatus', {}).get(
            'releaseRefs', {}).get('previous', {}).get('name', {})
        if previous_clusterrelease:
            LOG.warning("Upgrade has been detected. Test-check for correct version of initial tarfs(dib_datetime)"
                        "has been disabled - now, we check only that node in AllowedVersions for release")
            is_upgraded = True
        # as for now, we may check only fact that dist in allowed, not exact distr version to be deployed
        # for example - if we have multiply `ubuntu/focal` - we cant check that it was deployed from EXACT id.
        # the ids are used only in the error message about the missing distribution
        test_allowed_distr_ids = [d['id'] for d in test_allowed_dists]

        for m in self.cluster.get_machines():
            if m.is_disabled():
                continue

            kaas_exp_distr_id = m.data['spec']['providerSpec']['value'].get("distribution", None)

            if kaas_exp_distr_id is None:
                LOG.error(f"Exact 'distribution' reference not found in spec machine:{m.name}\n"
                          f"This is unexpected for release testing. Must be one of {test_allowed_distr_ids}")
                raise Exception(f'Unexpected distribution spec {kaas_exp_distr_id}')

            if not is_postpone_distribution_upgrade_enabled:
                # Expect exact the same distribution on the node as it is set in the Machine spec
                filtered_allowed_dists = [d for d in test_allowed_dists if d['id'] == kaas_exp_distr_id]
            else:
                # There can be any supported distribution in case if postponed distribution upgrade is enabled
                filtered_allowed_dists = test_allowed_dists

            # check for upgraded case
            # TODO(alexz): looks like there is no way in kaas, to identify machine initial deployment version.
            # so for upgrade check, we need to disable those check.
            # here we check that node has been initially provisioned from expected tarfs.
            # Those info are always initial, even after upgrade\reboot.
            if is_upgraded:
                LOG.debug("TODO(alexz): here(?) should be actual check for repository change after upgrade")

                # membership checks: any kernel / os_version of the filtered distributions is accepted
                test_allowed_distr_kernels = [d['kernel'] for d in filtered_allowed_dists]
                test_allowed_distr_os_vers = [d['os_version'] for d in filtered_allowed_dists]
                assert perhostdata[m.name]['kernel'] in test_allowed_distr_kernels, \
                    f"Expected machine=kernel on host os: {m.name} in {test_allowed_distr_kernels}, but have:\n" \
                    f"{perhostdata[m.name]['kernel']}"
                assert perhostdata[m.name]['os_version'] in test_allowed_distr_os_vers, \
                    f"Expected machine=os_version on host os: {m.name} in {test_allowed_distr_os_vers}, but have:\n" \
                    f"{perhostdata[m.name]['os_version']}"
                LOG.warning(f"machine dib_version check skipped due upgrade. "
                            f"Detected on node currently:{perhostdata[m.name]['dib_datetime']}")
                # after si-tests check finished, we may check expected reports from lcm-agent,
                # about actual information from node, but in perspective of lcm-agent. PRODX-22958 might be related.
                host_info = \
                    self.cluster.get_cluster_lcmmachine(name=m.name,
                                                        namespace=self.cluster.namespace).data['status']['hostInfo']
                assert host_info['kernelVersion'] in test_allowed_distr_kernels, \
                    f"Expected machine=kernel on host os: {m.name} in {test_allowed_distr_kernels}," \
                    f"but lcm-agent report:\n {host_info['kernelVersion']}"
            # check for initial deployment, exact check for data 1:1, w\o upgrade
            else:
                # exactly one greenfield test data entry is expected for the Machine distribution
                test_data_for_exp_dist = [d for d in filtered_allowed_dists
                                          if not d.get('notgreenfield', False) and d['id'] == kaas_exp_distr_id]
                LOG.info(f"For machine:{m.name}:\n"
                         f"KaaS deployed OS-id:{kaas_exp_distr_id}\n"
                         f"Test expected data for distr:{test_data_for_exp_dist}")

                assert len(test_data_for_exp_dist) == 1, \
                    f"{kaas_exp_distr_id} is not found in test expected data for distr"

                test_allowed_distr_kernel = test_data_for_exp_dist[0]['kernel']
                test_allowed_distr_os_ver = test_data_for_exp_dist[0]['os_version']
                test_allowed_distr_dib_ver = test_data_for_exp_dist[0]['dib_version']
                assert perhostdata[m.name]['kernel'] == test_allowed_distr_kernel, \
                    f"Expected machine=kernel on host os: {m.name} == {test_allowed_distr_kernel}, but have:\n" \
                    f"{perhostdata[m.name]['kernel']}"
                assert perhostdata[m.name]['os_version'] == test_allowed_distr_os_ver, \
                    f"Expected machine=os_version on host os: {m.name} == {test_allowed_distr_os_ver}, but have:\n" \
                    f"{perhostdata[m.name]['os_version']}"

                assert perhostdata[m.name]['dib_datetime'] == test_allowed_distr_dib_ver, \
                    f"Expected machine=dib_version on host os: {m.name} == {test_allowed_distr_dib_ver}, but have:\n" \
                    f"{perhostdata[m.name]['dib_datetime']}"
                # after si-tests check finished, we may check expected reports from lcm-agent,
                # about actual information from node, but in perspective of lcm-agent. PRODX-22958 might be related.
                host_info = \
                    self.cluster.get_cluster_lcmmachine(name=m.name,
                                                        namespace=self.cluster.namespace).data['status']['hostInfo']
                # NOTE(review): unlike the upgraded branch, the right operand here is a single test data
                # value, not a list. If it is a string (presumably it is - TODO confirm), 'in' is a
                # substring match, which is weaker than the '==' checks above. Confirm it is intended.
                assert host_info['kernelVersion'] in test_allowed_distr_kernel, \
                    f"Expected machine=kernel on host os: {m.name} in {test_allowed_distr_kernel}," \
                    f"but lcm-agent report:\n {host_info['kernelVersion']}"

    def check_created_machine(self, machine: "Machine"):
        """Run the set of checks for a newly created machine and for the cluster it was added to

        Args:
            machine: newly created machine object

        Returns: None
        """
        LOG.info(f'Check newly created machine {machine.name}')
        checker = self.cluster.check

        # The new machine itself must become Ready first
        checker.wait_machine_status_by_name(machine_name=machine.name,
                                            expected_status='Ready')

        # Then all the machines and nodes of the cluster
        checker.check_machines_status(timeout=1800)
        checker.check_cluster_nodes()
        checker.check_k8s_nodes()

        # Check/wait for correct docker service replicas in cluster
        changed_services = {'ucp-worker-agent-x': checker.get_ucp_worker_agent_name()}
        checker.check_actual_expected_docker_services(changed_after_upd=changed_services)

        # Pods and overall cluster readiness
        checker.check_k8s_pods()
        checker.check_actual_expected_pods(timeout=3200)
        checker.check_cluster_readiness()

    def check_docker_nodes(self):
        """Assert that hostnames of docker swarm nodes and names of k8s nodes are the same set

        Raises: AssertionError if the sets of names differ
        """
        swarm_client = self.cluster.get_dockerclient()
        k8s_client = self.cluster.k8sclient

        swarm_node_names = sorted(swarm_node["Hostname"]
                                  for swarm_node in swarm_client.node_ls(exec_pod_cmd=True))
        LOG.info(f"Docker swarm node names: {swarm_node_names}")

        k8s_node_names = sorted(k8s_node.name for k8s_node in k8s_client.nodes.list_all())
        LOG.info(f"Kubernetes node names: {k8s_node_names}")

        assert set(swarm_node_names) == set(k8s_node_names), (
            f"Node names from docker swarm cluster don't match "
            f"the node names from k8s cluster:\n"
            f"  Docker swarm node names: {swarm_node_names}\n"
            f"  K8s node names: {k8s_node_names}")

    def wait_deleted_node(self, node_name: str):
        """Wait until the k8s node is gone and docker swarm nodes list matches k8s nodes list

        Args:
            node_name: name of the k8s node which is expected to be deleted

        Returns: None
        """
        wait_timeout = 1800

        LOG.info(f"Wait k8s node delete - {node_name}")
        waiters.wait(
            lambda: node_name not in self.cluster.get_k8s_node_names(),
            timeout=wait_timeout,
            interval=10,
        )

        LOG.info("Wait docker swarm nodes list to match k8s nodes list")
        waiters.wait_pass(
            self.check_docker_nodes,
            timeout=wait_timeout,
            interval=30,
        )

    def check_deleted_node(self, node_name: str):
        """Wait for the node deletion and check that the cluster is consistent after it

        Args:
            node_name: name of the deleted k8s node

        Returns: None

        Raises: AssertionError if number of machines differs from number of k8s nodes
        """
        self.wait_deleted_node(node_name)

        checker = self.cluster.check
        checker.check_machines_status()
        checker.check_cluster_nodes()
        checker.check_k8s_nodes()

        remaining_machines = self.cluster.get_machines_names()

        LOG.info("Check number of k8s nodes")
        remaining_nodes = self.cluster.get_k8s_node_names()

        assert len(remaining_machines) == len(remaining_nodes), (
            f"Unequal number of nodes and machines\nnodes: {remaining_nodes}\nmachines: {remaining_machines}")

        # PRODX-30294: Pause between node deletion and new node creation, to let MKE complete post-deletion actions
        time.sleep(60)

    def check_deleted_machinepool(self, machinepool_name: str):
        """Wait until the machinepool disappears from the cluster machinepools list

        Args:
            machinepool_name: name of the machinepool which is expected to be deleted

        Returns: None

        Raises:
            ApiException: for any API error except 404 "Not Found"
            AssertionError: if the machinepool is still listed after 404 "Not Found" response
        """
        LOG.info(f"Wait for  machinepool deletion - {machinepool_name}")
        try:
            waiters.wait(lambda: machinepool_name not in self.cluster.get_machinepools(),
                         interval=30, timeout=500)
        except ApiException as ex:
            if not (str(ex.status) == "404" and ex.reason == "Not Found"):
                # Only "Not Found" is an expected answer while the object is being deleted.
                # Any other API error must not be reported as a successful deletion.
                raise
            all_pools = self.cluster.get_machinepools()
            assert machinepool_name not in all_pools,\
                f"Machinepool {machinepool_name} was not deleted and still exists " \
                f"in cluster {self.cluster.name}. Full list {all_pools}"

    def check_openstack_metadata_deletion(self, node_name: str):
        """Check that no compute service, volume service or network agent is left for the node

        The lists are requested with openstack CLI from the first 'keystone-client' pod
        in the 'openstack' namespace. Network agents are not checked when tf is enabled.

        Args:
            node_name: value of the 'Host' field to look for in the openstack lists

        Returns: None

        Raises: AssertionError if keystone-client pod is not found or some metadata still exists
        """
        LOG.info(f"Checking out all the metadata is cleaned up for node {node_name}")
        keystone_pods = self.cluster.k8sclient.pods.list(namespace="openstack", name_prefix='keystone-client')
        assert len(keystone_pods) > 0, "No pods found with prefix keystone-client in namespace openstack"
        client_pod = keystone_pods[0]

        def _leftover(openstack_cmd, field):
            """Run the list command in the client pod, return `field` of the first entry for the node or None"""
            listing = yaml.safe_load(client_pod.exec(['/bin/sh', '-c', openstack_cmd]))
            for_node = [entry for entry in listing if entry.get('Host') == node_name]
            if not for_node:
                return None
            return for_node[0].get(field)

        compute_service = _leftover('PYTHONWARNINGS=ignore::UserWarning '
                                    'openstack compute service list -f yaml', 'ID')
        volume_service = _leftover('PYTHONWARNINGS=ignore::UserWarning '
                                   'openstack volume service list -f yaml', 'Host')

        if self.cluster.tf_enabled():
            LOG.warning("Skip checking network agent list when tf enabled")
            network_agent = None
        else:
            network_agent = _leftover('PYTHONWARNINGS=ignore::UserWarning '
                                      'openstack network agent list -f yaml', 'ID')

        assert not compute_service and not volume_service and not network_agent, \
            f"Some openstack metadata still exist, associated with node {node_name}:\n" \
            f"Compute service: {compute_service}\nVolume service: {volume_service}\nNetwork agent list: {network_agent}"
        LOG.info("All metadata has been cleaned up")

    def check_overlay_encryption_functionality(self):
        """Run wireguard checks if 'secureOverlay' is enabled in the cluster providerSpec, otherwise skip"""
        provider_value = self.cluster.data['spec'].get('providerSpec', {}).get('value', {})
        if provider_value.get('secureOverlay', False):
            self.cluster.check.check_wireguard_enabled_successfully()
            self.cluster.check.check_wireguard_traffic_for_machine()
            return
        LOG.info('secureOverlay disabled. Skip encryption checking')

    def check_ipsec_functionality(self):
        """Check ipsec policy and ipsec encrypted traffic between the cluster machines

        For every machine:
          1. Check ipsec policy: internal ip of every other machine must be present among the
             addresses extracted from 'ip xfrm policy list' output
          2. Check ipsec encryption for tunnel traffic: esp/ah packets captured by tcpdump (6 seconds
             timeout) must have every other machine as a source or a destination

        The check is skipped if 'secureOverlay' is not enabled in the cluster providerSpec.

        Returns: None

        Raises: AssertionError with all the problems collected from all the machines
        """
        if not self.cluster.data['spec'].get('providerSpec', {}).get('value', {}).get('secureOverlay', False):
            LOG.info('secureOverlay disabled. Skip ipsec checking')
            return

        # the commands are the same for all the machines
        ipsec_policy_cmd = 'sudo ip xfrm policy list|grep " ipencap"|cut -d" " -f4|cut -d"/" -f1'
        tcpdump_cmd = "timeout 6 tcpdump -nei any esp or ah -c300 -q 2>/dev/null"\
                      "|sed 's/  In/ I/g'|cut -d' ' -f4,6|tr -d ':'"

        machines = self.cluster.get_machines()
        err_msg = ''
        for machine in machines:
            neighbor_machines_ips = [m.internal_ip for m in machines if m.internal_ip != machine.internal_ip]
            policy_result = machine.exec_pod_cmd(ipsec_policy_cmd, verbose=False)
            tcpdump_result = machine.exec_pod_cmd(tcpdump_cmd, verbose=False)
            policy_list = policy_result['logs'].strip().split('\n')
            # str.split() returns [''] for an empty string, so empty lines are dropped explicitly
            # to make "no traffic captured" detectable
            tcpdump_data = [line for line in tcpdump_result['logs'].strip().split('\n') if line]

            LOG.info(f"Сheck ipsec policy on machine {machine.name}/{machine.internal_ip}")
            if policy_result['exit_code'] != 0:
                err_msg += f"Ipsec policy check error {self._cluster.name}/{machine.name}:"\
                           f"\n{policy_result['logs']}\n"
            for neighbor_ip in neighbor_machines_ips:
                if neighbor_ip not in policy_list:
                    err_msg += f"Ipsec disabled for tunnel between {machine.name} and dst tunnel ip {neighbor_ip}\n"

            LOG.info(f"Сheck ipsec traffic on machine {machine.name}/{machine.internal_ip}")
            if tcpdump_result['exit_code'] != 0:
                err_msg += f"Ipsec traffic check error {self._cluster.name}/{machine.name}:"\
                           f"\n{tcpdump_result['logs']}\n"
            tcpdump_neighbors = []
            if not tcpdump_data:
                err_msg += f"Ipsec traffic not found in 6 second on {machine.name}:\n{tcpdump_result['logs']}\n"
            else:
                for src_dst in set(tcpdump_data):
                    addresses = src_dst.split(' ')
                    if len(addresses) != 2:
                        LOG.info("Unexpected count of addresses. There must be 2 addresses (src and dst)")
                    else:
                        src, dst = addresses
                        # the address which is not ours is the neighbor
                        if src == machine.internal_ip:
                            tcpdump_neighbors.append(dst)
                        else:
                            tcpdump_neighbors.append(src)
            for neighbor_ip in neighbor_machines_ips:
                if neighbor_ip not in tcpdump_neighbors:
                    err_msg += f"Ipsec traffic is not detected between src machine {machine.name} "\
                               f"and dst machine ip {neighbor_ip}\n"
        assert err_msg == '', err_msg

    def check_wireguard_enabled_successfully(self):
        """Check that wireguard is enabled on every machine of the cluster

        1. Check 'wireguard.cali' interface is present
        2. Check wireguard public key presence in calico node description

        Raises: AssertionError with all the problems collected from all the machines
        """
        problems = []
        cluster_machines = self.cluster.get_machines()
        show_iface_cmd = 'sudo ip a show wireguard.cali'
        for mchn in cluster_machines:
            LOG.info(f"Сheck wireguard interface is present on machine {mchn.name}/{mchn.internal_ip}")
            iface_result = mchn.exec_pod_cmd(show_iface_cmd, verbose=False)
            if iface_result['exit_code'] != 0:
                problems.append(f"Calico Wireguard interface is not present {self._cluster.name}/{mchn.name}:"
                                f"\n{iface_result['logs']}\n")

            calico_node_cmd = (f"calicoctl --allow-version-mismatch get node {mchn.get_k8s_node_name()} "
                               f"-o yaml")
            calico_result = mchn.exec_pod_cmd(calico_node_cmd, verbose=False, pod_type='calico')
            LOG.info(f"Сheck wireguard pubKey is present in calico node description "
                     f"on machine {mchn.name}/{mchn.internal_ip}")
            if calico_result['exit_code'] != 0:
                problems.append(f"Could not get calico node description on machine {mchn.name}/{mchn.internal_ip}"
                                f"\n{calico_result['logs']}\n")
            if 'wireguardPublicKey' not in calico_result['logs']:
                problems.append(f"Wireguard public key does not found in calico node on machine "
                                f"{mchn.name}/{mchn.internal_ip}"
                                f"\n{calico_result['logs']}\n")

        err_msg = ''.join(problems)
        assert err_msg == '', err_msg

    def check_wireguard_traffic_for_machine(self):
        """Check that every machine exchanges wireguard traffic with all the other calico nodes

        - Get every calico-node pods
        - Collect calico configuration (detect interface and ip for tcpdump)
        - Tcpdump wireguard traffic (port 51820) on every machine and parse peers addresses
        - Compare the peers found in the traffic with the ips of the other calico nodes

        Returns: None

        Raises:
            Exception: if calico tunnel interface or its detection method can't be determined
            AssertionError: if configuration is not collected for some pod, tcpdump failed,
                no traffic was captured or the peers don't match the expected neighbors
        """
        def get_interface_by_ip(ip='', from_machine=None):
            """Return name of the interface which has the given ip, according to ifconfig on the machine"""
            host_iface_by_ip_cmd = f"ifconfig|grep -B1 'inet {ip}'|" + \
                                   "awk '$1!=\"inet\" && $1!=\"--\" {print $1}'|tr -d \":\""
            # the command is passed base64-encoded to avoid quoting issues
            iface_cmd_b64 = base64.b64encode(host_iface_by_ip_cmd.encode('ascii')).decode('ascii')
            host_iface_cmd_res = from_machine.exec_pod_cmd(f'eval $(echo {iface_cmd_b64}|base64 -d)', verbose=False)
            return host_iface_cmd_res['logs'].strip()

        def get_ip_by_interface(interface='', from_machine=None):
            """Return ip of the given interface, according to ifconfig on the machine"""
            host_ip_by_iface_cmd = f"ifconfig {interface}|grep 'inet '|" +\
                                   "awk '{print $2}'"
            ip_cmd_b64 = base64.b64encode(host_ip_by_iface_cmd.encode('ascii')).decode('ascii')
            host_ip_cmd_res = from_machine.exec_pod_cmd(f'eval $(echo {ip_cmd_b64}|base64 -d)', verbose=False)
            return host_ip_cmd_res['logs'].strip()

        calico_nodes_pods = self.cluster.k8sclient.pods.list('kube-system', 'calico-node-')
        calico_node_data = {}

        # Collect calico configuration loop
        for calico_pod in calico_nodes_pods:
            node_name = calico_pod.data['spec']['node_name']
            machine = self.cluster.get_machine_by_k8s_name(node_name)
            calico_node_container = [c for c in calico_pod.read().spec.containers if c.name == 'calico-node'][0]
            node_cfg = {
                'iface_source': '',
                'iface': '',
                'ip': '',
                'neighbors_ip': [],
            }
            calico_node_data[node_name] = node_cfg
            for container_env_var in calico_node_container.to_dict()['env']:
                if container_env_var['name'] == 'IP' and container_env_var['value_from']:
                    node_cfg['iface_source'] = container_env_var['value_from']['field_ref']['field_path']
                elif container_env_var['name'] == 'IP' and container_env_var.get('value') == 'autodetect':
                    node_cfg['iface_source'] = 'autodetect'
                if container_env_var['name'] == 'IP_AUTODETECTION_METHOD' and container_env_var.get('value'):
                    node_cfg['iface'] = container_env_var['value'].split('=')[1]
            if node_cfg['iface'] and node_cfg['iface_source'] == 'autodetect':
                # interface is known from the autodetection method, ip is requested from the host
                node_cfg['ip'] = get_ip_by_interface(node_cfg['iface'], machine)
                LOG.info(f"Calico tunnel for {machine.name} configured over "
                         f"{node_cfg['iface']} / {node_cfg['ip']}")
            elif 'hostIP' in node_cfg['iface_source']:
                # ip is the pod host ip, interface is requested from the host
                node_cfg['ip'] = calico_pod.data['status']['host_ip']
                node_cfg['iface'] = get_interface_by_ip(node_cfg['ip'], machine)
                LOG.info(f"Calico tunnel for {machine.name} configured over "
                         f"{node_cfg['iface']} / {node_cfg['ip']}")
            else:
                raise Exception("Test can't detect calico tunnel interface or detection method")
        assert len(calico_node_data) == len(calico_nodes_pods), "Missing collected configuration for some nodes"

        # Loop for tcpdump
        test_timeout = '30'
        for m in self.cluster.get_machines():
            err_msg = ''
            node_name = m.get_k8s_node_name()
            node_cfg = calico_node_data[node_name]
            node_cfg['neighbors_ip'] = [cfg['ip'] for cfg in calico_node_data.values()
                                        if node_cfg['ip'] != cfg['ip']]

            tcpdump_cmd = f"timeout {test_timeout} tcpdump -nei {node_cfg['iface']} port 51820 " +\
                          '-c3000 -q 2>/dev/null | sed -r \"s/.* ([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+).* > ' +\
                          '([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+).*/\\1 > \\2/\"|sed \"s/ >//\"'
            tcpdump_cmd_b64 = base64.b64encode(tcpdump_cmd.encode('ascii')).decode('ascii')
            tcpdump_result = m.exec_pod_cmd(f'eval $(echo {tcpdump_cmd_b64}|base64 -d)', verbose=False)
            # str.split() returns [''] for an empty string, so empty lines are dropped explicitly
            # to make "no traffic captured" detectable
            tcpdump_data = [line for line in tcpdump_result['logs'].strip().split('\n') if line]
            LOG.info(f"Сheck wireguard traffic on machine {m.name} / {node_cfg['ip']};"
                     f" Calico config: {node_cfg}")
            if tcpdump_result['exit_code'] != 0:
                err_msg += f"Wireguard traffic check error {self._cluster.name} / {m.name}:"\
                           f"{tcpdump_result['logs'].strip()}\n"
            # a set, because the same neighbor is seen both as a source and as a destination
            tcpdump_neighbors = set()
            if not tcpdump_data:
                err_msg += f"Wireguard traffic not found in {test_timeout} second on {m.name}:" \
                           f"\n{tcpdump_result['logs'].strip()}\n"
            else:
                for src_dst in set(tcpdump_data):
                    addresses = src_dst.split(' ')
                    if len(addresses) != 2:
                        LOG.info("Unexpected count of addresses. There must be 2 addresses (src and dst)")
                    else:
                        src, dst = addresses
                        # the address which is not ours is the neighbor
                        if src == node_cfg['ip']:
                            tcpdump_neighbors.add(dst)
                        else:
                            tcpdump_neighbors.add(src)
            # list.sort() sorts in place and returns None, so sorted copies are compared instead
            expected_neighbors = sorted(set(node_cfg['neighbors_ip']))
            actual_neighbors = sorted(tcpdump_neighbors)
            if actual_neighbors != expected_neighbors:
                err_msg += f"Wireguard encrypted traffic missmatch for {m.name}:\n" + \
                           f" - Expected neighbors {expected_neighbors}\n" + \
                           f" - Actual neighbors {actual_neighbors}\n"
            assert err_msg == '', err_msg

    def check_expected_actual_reboot_required_status(self):
        """Compare reboot required status and its packages reason reported by machines with the hosts

        For every enabled machine the status from machine.is_reboot_required() must be equal to the
        status from machine.reboot_required_status_from_host(), and the packages reported by the host
        must be a subset of the packages reported by the machine.

        Raises: AssertionError with yaml dump of all the mismatches
        """
        LOG.info("Compare reboot required status between KaaS machines and hosts")
        failed_reboot_status = {}
        failed_packages_reason = {}
        for mchn in self.cluster.get_machines():
            if mchn.is_disabled():
                LOG.info(f"Machine {mchn.name} is disabled. Skipping")
                continue
            m_name = mchn.name
            # both calls return a sequence: status first, packages reason second
            from_machine = mchn.is_reboot_required()
            from_host = mchn.reboot_required_status_from_host()
            machine_status, host_status = from_machine[0], from_host[0]
            machine_packages, host_packages = from_machine[1], from_host[1]
            LOG.info(f"Current reboot required status for machine {m_name} is "
                     f"{machine_status}, expected: {host_status}. Actual packages reason: "
                     f"{machine_packages}, expected: {host_packages}")
            if machine_status != host_status:
                failed_reboot_status[m_name] = {
                    'expected_reboot_required_status': host_status,
                    'actual_reboot_required_status': machine_status}

            if not set(host_packages) <= set(machine_packages):
                failed_packages_reason[m_name] = {
                    'expected_packages_reason': host_packages,
                    'actual_packages_reason': machine_packages}

        message_parts = []
        if failed_reboot_status:
            message_parts.append(f"\n\nReboot status mismatch for next machines:\n"
                                 f"{yaml.dump(failed_reboot_status)}")
        if failed_packages_reason:
            message_parts.append(f"\nPackages status mismatch for next machines:\n"
                                 f"{yaml.dump(failed_packages_reason)}")
        fail_message = "".join(message_parts)

        assert not fail_message, fail_message
        LOG.info("Reboot required status and reason packages for all machines as expected")

    def check_connection_via_bastion(self, target_machine: "Machine"):
        """
        Check connection to target machine internal IP via cluster bastion host

        The command 'hostname' is executed on the target machine through the bastion SSH connection
        and its output is compared with the k8s node name of the machine.

        Args:
            target_machine (Machine): target machine object
        Returns: None
        Raises:
            AssertionError: if the command output differs from the expected node name

        """
        expected_name = target_machine.get_k8s_node_name()
        destination_ip = target_machine.internal_ip
        bastion_ip = self.cluster.bastion_ip

        LOG.info(f"Trying to connect to node {expected_name} with IP {destination_ip} "
                 f"via bastion host with IP {bastion_ip}")
        keys = utils.load_keyfile(self.cluster.private_key)
        key = utils.get_rsa_key(keys['private'])
        auth = exec_helpers.SSHAuth(username=settings.KAAS_BASTION_SSH_LOGIN,
                                    password='', key=key)
        ssh = self.cluster.get_bastion_remote()
        result = ssh.execute_through_host(destination_ip, "hostname", auth=auth)
        actual_name = result.stdout_str
        # Keep both values in the message: a bare assert gives no hint which host has answered
        assert actual_name == expected_name, (
            f"Unexpected hostname received from {destination_ip} via bastion host {bastion_ip}: "
            f"expected '{expected_name}', actual '{actual_name}'")

    def check_os_vm(self, openstack_client, image, flavor, az_name,
                    metadata=None, volume_size=None, boot_from_volume=False, vm_name=None, vm_id=None):
        """
        Check OpenStack VM parameters
        Args:
            openstack_client: OpenStack client
            vm_name: OpenStack instance name (instead of vm_id)
            vm_id: OpenStack instance id (instead of vm_name)
            image: OpenStack image name
            flavor: OpenStack flavor name
            az_name: Availability Zone name
            metadata: expected metadata (dict); every key must be present on the VM with an equal value
            boot_from_volume: allowed boot from volume flag
            volume_size: expected volume size

        Returns: None

        Raises:
            Exception: if neither vm_name nor vm_id is specified
            AssertionError: if any of the VM parameters does not match the expected value

        """
        LOG.info("Check OpenStack VM parameters")
        if vm_name:
            openstack_machine = openstack_client.get_server_by_name(vm_name).to_dict()
        elif vm_id:
            openstack_machine = openstack_client.get_server_by_id(vm_id).to_dict()
        else:
            raise Exception("vm_name or vm_id should be specified")
        # Server dict may have no metadata at all, treat that as an empty mapping
        os_metadata = openstack_machine.get('metadata') or {}
        os_flavor_id = openstack_machine.get('flavor', {}).get('id')
        os_flavor = openstack_client.nova.flavors.get(flavor=os_flavor_id)
        os_image = openstack_machine.get('image')
        os_az_name = openstack_machine.get('OS-EXT-AZ:availability_zone')
        assert os_flavor.name == flavor, "VM is created from incorrect flavor"
        assert os_az_name == az_name, (f"VM is created in incorrect availability zone: "
                                       f"expected '{az_name}', actual '{os_az_name}'")
        # check VM metadata
        # Compare by key presence and equality, so that falsy expected values ('', 0, False)
        # which are correctly set on the VM are not reported as missing
        error_keys = {key: value for key, value in (metadata or {}).items()
                      if key not in os_metadata or os_metadata[key] != value}
        assert not error_keys, "The following metadata has not been populated {0}".format(error_keys)

        if boot_from_volume:
            LOG.info("Check OpenStack VM boot from volume parameters")
            os_attached_volume = openstack_machine.get('os-extended-volumes:volumes_attached') or []
            # Validate the list before indexing it, otherwise an empty list ends with IndexError
            assert len(os_attached_volume) == 1, "Only one attached volume should be exist"
            assert os_image == '', 'Should be empty if booted from volume'
            os_volume = os_attached_volume[0].get('id')
            cinder_volume = openstack_client.cinder.volumes.get(volume_id=os_volume)
            cinder_image_name = cinder_volume.volume_image_metadata.get('image_name')
            assert cinder_volume.size == volume_size, "Volume size is incorrect"
            assert cinder_volume.bootable == 'true', "Volume should be bootable for boot from volume feature"
            assert cinder_image_name == image, "Volume created from incorrect image"
        else:
            # VM image parameter exist only if deploy without boot from volume option
            assert os_image, "VM has no image reference, while boot from volume is not expected"
            image_id = os_image.get('id')
            glance_image = openstack_client.nova.glance.find_image(image_id)
            assert glance_image.name == image, "VM is created from incorrect image"

    def check_aws_vm(self, instance_id, instance_type, ami_id):
        """
        Verify that an AWS instance has the expected instance type and AMI

        The instance is fetched with the client taken from the cluster provider resources.

        Args:
            instance_id: AWS instance ID to look up
            instance_type: expected instance type
            ami_id: expected AMI ID

        Returns: None

        Raises:
            AssertionError: if instance type or AMI ID differs from the expected one

        """
        LOG.info(f"Check AWS VM parameters for Instance ID = {instance_id}")
        aws_instance = self.cluster.provider_resources.client.get_instance(instance_id=instance_id)

        actual_type = aws_instance.instance_type
        assert instance_type == actual_type, "VM is created from incorrect instance type"

        actual_ami = aws_instance.image.image_id
        assert ami_id == actual_ami, "VM is created from incorrect AMI"

    def check_deploy_stage_success(self, skipped_stages_names=None):
        """Wait until all cluster deploy stages are successful, except the skipped ones

        Stages are re-read on every attempt with
        'get_cluster_deploy_stages_filtered_by_cluster_name'. A stage is considered failed
        if its 'success' field is falsy and its name is not listed in 'skipped_stages_names'.

        Args:
            skipped_stages_names: stage name (str) or a collection (list/tuple/set) of stage names
                                  that are allowed to be unsuccessful

        Raises:
            AssertionError: if some stages are still failed after the wait is over
        """
        if not skipped_stages_names:
            skipped_stages = []
        elif isinstance(skipped_stages_names, str):
            skipped_stages = [skipped_stages_names]
        elif isinstance(skipped_stages_names, (list, tuple, set, frozenset)):
            skipped_stages = list(skipped_stages_names)
        else:
            # Unsupported types were silently ignored before, keep that but make it visible
            LOG.warning(f"Unsupported type of skipped_stages_names: {type(skipped_stages_names)}, "
                        f"no stages will be skipped")
            skipped_stages = []
        LOG.info(f"Check stages in crd clusterdeploy for cluster {self.cluster.name}")
        failed_stages = []

        def _inner():
            nonlocal failed_stages
            # Start from scratch on every attempt, only the last result is reported
            failed_stages = []
            stages = self.cluster.get_cluster_deploy_stages_filtered_by_cluster_name(self.cluster.name)
            for stage in stages or []:
                stage_name = stage.get('name', '')
                success = stage.get('success', '')
                if success or stage_name in skipped_stages:
                    continue
                failed_stages.append(f"Cluster: {self.cluster.name}; "
                                     f"Failed stage: {stage_name}; "
                                     f"Success: {success}; "
                                     f"Message: {stage.get('message', '')}; "
                                     f"Time of last retry: {stage.get('timestamp', '')} ")
            return not failed_stages

        try:
            waiters.wait(_inner)
        except exceptions.TimeoutError:
            # Timeout is reported by the assertion below together with the list of failed stages
            pass
        assert not failed_stages, f'Next stages are failed {failed_stages}'

    def check_upgrade_stage_success(self, skipped_stages_names=None):
        """Check that all started cluster upgrade stages are successful, except the skipped ones

        Stages are read once with 'get_cluster_upgrade_stages_filtered_by_cluster_name'.
        A stage is considered failed if its 'success' field is falsy, its status is not 'NotStarted'
        and its name is not listed in 'skipped_stages_names'.

        Args:
            skipped_stages_names: stage name (str) or a collection (list/tuple/set) of stage names
                                  that are allowed to be unsuccessful

        Raises:
            AssertionError: if at least one stage is failed
        """
        if not skipped_stages_names:
            skipped_stages = []
        elif isinstance(skipped_stages_names, (list, tuple, set, frozenset)):
            # A collection must be flattened: appending it as a single element makes
            # the 'stage_name not in skipped_stages' check below never match
            skipped_stages = list(skipped_stages_names)
        else:
            skipped_stages = [skipped_stages_names]
        cluster_release = self.cluster.clusterrelease_version
        LOG.info(f"Cluster release: {cluster_release}")
        failed_stages = []
        LOG.info(f"Check stages in crd clusterupgradehistroy "
                 f"for cluster {self.cluster.name}")
        stages = self.cluster.get_cluster_upgrade_stages_filtered_by_cluster_name(self.cluster.name)
        for stage in stages or []:
            stage_name = stage.get('name', '')
            success = stage.get('success', '')
            status = stage.get('status', '')
            # Stages which were not started yet are not failures
            if success or status == 'NotStarted' or stage_name in skipped_stages:
                continue
            failed_stages.append(f"Cluster: {self.cluster.name}; "
                                 f"Failed stage: {stage_name}; "
                                 f"Status: {status}; "
                                 f"Success: {success}; "
                                 f"Message: {stage.get('message', '')}; "
                                 f"Time of last retry: {stage.get('timestamp', '')} ")
        assert not failed_stages, f'Next stages are failed {failed_stages}'

    def check_upgraded_machines_cordon_drain_stages(self, status_matches=None,
                                                    skip_maintenance=False, reboot_expected=None):
        """
        For patch releases we expect that the cordon drain stages did not occur.

        All statuses here: https://gerrit.mcp.mirantis.com/plugins/gitiles/kaas/core/+/
        refs/heads/master/vendor/github.com/Mirantis/kaas-api/v2/pkg/apis/lcm/v1alpha1/lcmstatus_types.go

        For every Machine the upgrade stages are read from the object named
        '<machine name>-<actual cluster release version with "+" replaced by "-">'.
        Machines without stages are skipped. Only stages whose name contains one of
        'status_matches' are inspected:
          - reboot is expected for the Machine: the stage must exist (unless skip_maintenance is set)
            and must not be unsuccessful with the status 'NotStarted';
          - reboot is not expected: the stage must not exist on a non-management cluster, except for
            an unsuccessful 'NotStarted' stage when skip_maintenance is set.

        Args:
            status_matches: str or list of str, substrings to look for in the stage names.
                            Default is ['Kubernetes drained']
            skip_maintenance: bool, tolerate not started stages (and missing stages for rebooted Machines)
            reboot_expected: dict, keys are machine names, truthy value means that reboot is expected

        Raises:
            AssertionError: if unexpected stages are present, or expected stages are absent/not completed
        """
        LOG.info("Check that cordon/drain stages after the nodes upgrade did not run for patchreleases, "
                 "except distribution upgrade")
        reboot_expected = reboot_expected or {}
        if not status_matches:
            status_matches = ['Kubernetes drained']
        elif isinstance(status_matches, str):
            status_matches = [status_matches]
        stages_present = []
        stages_absent = []
        actual_version = self.cluster.clusterrelease_actual_version
        to_release = actual_version.replace('+', '-')
        for m in self.cluster.get_machines():
            machine_name = m.name
            LOG.info(f"Check stages in crd machineupgradehistroy for machine {machine_name}")
            stages = self.cluster.get_machine_upgrade_stages(machine_name=f"{machine_name}-{to_release}",
                                                             namespace=self.cluster.namespace)
            if not stages:
                continue
            machine_reboot_expected = reboot_expected.get(machine_name)
            for match in status_matches:
                match_found = False
                for stage in stages:
                    stage_name = stage.get('name', '')
                    if match not in stage_name:
                        continue
                    status = stage.get('status', '')
                    success = stage.get('success', '')
                    not_started = not success and status == 'NotStarted'
                    stage_info = (f"Cluster: {self.cluster.name}; "
                                  f"Machine: {machine_name}; "
                                  f"Stage: {stage_name}; "
                                  f"Success: {success}; "
                                  f"Status: {status};"
                                  f"Message: {stage.get('message', '')}; "
                                  f"Time of last retry: {stage.get('timestamp', '')} ")
                    if machine_reboot_expected:
                        # Reboot is expected for the Machine, probably because of distribution change
                        # during a patchrelease upgrade
                        match_found = True
                        if not_started:
                            # Stage should be completed, but it was not started
                            stages_absent.append(stage_info)
                    elif not self.cluster.is_management:
                        # Reboot is not expected for the Machine, should be no drains (except for mgmt cluster)
                        if not_started and skip_maintenance:
                            # Report the stage that is being skipped, not the list of collected failures
                            LOG.info(f"Stage '{stage_name}' is present for machine {machine_name}, but has status "
                                     f"NotStarted which is expected behaviour with SkipMaintenance flag")
                            continue
                        stages_present.append(stage_info)
                if machine_reboot_expected and not match_found and not skip_maintenance:
                    stages_absent.append(f"Cluster: {self.cluster.name}; "
                                         f"Machine: {machine_name}; "
                                         f"Stage '{match}' is missing, but expected as cluster maintenance "
                                         f"is not skipped and machine has been rebooted")
        assert len(stages_present) == 0, f'Next stages are present, but should not: {stages_present}'
        assert len(stages_absent) == 0, f'Next stages are absent or not completed, but should be: {stages_absent}'

    def wait_leader_migrate_from_node(self, node_name, timeout=600):
        """
        Block until none of the leader pods is one of the pods found on the given node

        The list of pods on the node is collected once, before the wait starts. Leaders are
        re-read every 30 seconds.

        Args:
            node_name: node that the leaders are expected to leave
            timeout: wait timeout in seconds

        Returns: None

        """
        node_pod_names = [pod.name for pod in self.cluster.get_pods_for_node(node_name=node_name)]

        def _get_data():
            leaders: dict = self.cluster.get_leader_pods()
            remaining = {}
            for key, pod_name in leaders.items():
                if pod_name in node_pod_names:
                    remaining[key] = pod_name
            LOG.info(f"Next leader pods still not changed: {remaining}")
            # Done as soon as no leader is left on the node
            return not remaining

        LOG.info(f"Wait until leaders migrate from node {node_name}")
        waiters.wait(_get_data, interval=30, timeout=timeout)

    def check_audit(self):
        """
        Scenario:
            1. For child clusters on new enough KaaS/cluster releases: check that kubernetes
               audit is enabled in MKE config and profiling options are equal to
               settings.K8S_PROFILING_ENABLE.
            2. If auditd is enabled in the cluster object, for every machine of the cluster
               (commands are issued with `exec_pod_cmd`):
               a. Check if auditd service is active.
               b. List auditd preset rules (logged only).
               c. Print auditd custom rules files, if they exist (logged only).
               d. Check if auditd log rotation crontask is installed.
                  If it is, execute it and print the list of log
                  files before and after the rotation.

        Note: Some checks do not verify correctness of the auditd
              state, e.g. if log rotation crontask doesn't exist it
              is not considered as a failure. This is due to
              optional nature of such states, and due to it's not possible
              to verify them without getting the desired state from the
              cluster object, what is an overkill now.

        Raises:
            AssertionError: if MKE audit/profiling config is wrong, auditd service is not active
                            or the log rotation crontask fails
            Exception: if the crontask existence check returns an unexpected exit code
        """
        child_version = self.cluster.clusterrelease_version
        mos_17 = "mosk-17-1-0-rc-24-1"
        mke_16 = "mke-16-1-0-rc-3-7-4"
        mgmt_cluster = self.cluster._manager.get_mgmt_cluster()
        kaas_version = version.parse(mgmt_cluster.get_kaasrelease_version())
        check_enabled = False
        if self.cluster.is_child and kaas_version >= version.parse("kaas-2-26-0-rc"):
            if child_version.startswith('mos') and version.parse(child_version) >= version.parse(mos_17):
                check_enabled = True
            if child_version.startswith('mke') and version.parse(child_version) >= version.parse(mke_16):
                check_enabled = True
        if check_enabled:
            mke_client = self.cluster.get_mke_dashboardclient()
            mke_config = toml.loads(mke_client.get_mke_config())
            LOG.warning(f"audit config: {mke_config}")
            assert mke_config["audit_log_configuration"]["level"] == "metadata" and \
                mke_config["cluster_config"]["kube_api_server_auditing"] is True, \
                f"kubernetes audit is not enabled on cluster {self.cluster.name}"
            cluster_config = mke_config["cluster_config"]
            for option in ("kube_api_server_profiling_enabled",
                           "kube_controller_manager_profiling_enabled",
                           "kube_scheduler_profiling_enabled"):
                assert cluster_config[option] == settings.K8S_PROFILING_ENABLE, \
                    f"{option} is not set in mke config, " \
                    f"expected: {settings.K8S_PROFILING_ENABLE}, " \
                    f"actual {cluster_config}"

        cluster_auditd_enable = self.cluster.data['spec'].get('providerSpec', {}).get(
            'value', {}).get('audit', {}).get('auditd', {}).get('enabled', False)
        if not cluster_auditd_enable and not settings.AUDITD_ENABLE:
            LOG.info('Auditd disabled. Skip checking')
            return
        if not cluster_auditd_enable and settings.AUDITD_ENABLE:
            LOG.warning('Auditd flag exists but is not present in cluster object. Skip Checking')
            return

        custom_rules_files = [
            '60-custom.rules',
            '60-custom-x32.rules',
            '60-custom-x64.rules'
        ]
        for machine in self.cluster.get_machines():
            LOG.info(f"Accessing {machine.name}")

            LOG.info("Checking auditd.service ...")
            cmd = machine.exec_pod_cmd("systemctl is-active auditd.service")
            assert cmd['exit_code'] == 0, \
                f"auditd.service is not active on {machine.name}:\n{cmd['logs']}"

            LOG.info("Looking for auditd preset rules ...")
            cmd = machine.exec_pod_cmd("ls -1 /etc/audit/rules.d")
            LOG.info(f"Found the following auditd rules: {cmd['logs']}")

            LOG.info("Looking for auditd custom rules ...")
            for filename in custom_rules_files:
                cmd = machine.exec_pod_cmd(f"cat /etc/audit/rules.d/{filename}")

                if cmd['exit_code'] == 0:
                    # Command output is stored under the 'logs' key, as in all other calls here
                    LOG.info(f"Custom rules file '{filename}' contains:\n{cmd['logs']}")
                elif cmd['exit_code'] == 1:
                    LOG.info(f"Seems that file '{filename}' does not exist.")
                else:
                    LOG.error(f"Unexpected error '{cmd['exit_code']}'")

            LOG.info("Checking auditd log rotation ...")
            cmd = machine.exec_pod_cmd("test -f /etc/cron.daily/auditd.cron")

            if cmd['exit_code'] == 0:
                LOG.info("Auditd crontask installed.")
                auditd_cron_exists = True
            elif cmd['exit_code'] == 1:
                LOG.info("Auditd crontask not installed.")
                auditd_cron_exists = False
            else:
                raise Exception(f"Unexpected exit code '{cmd['exit_code']}'")

            if auditd_cron_exists:
                cmd = machine.exec_pod_cmd("ls -1 /var/log/audit")
                LOG.info(f"auditd log files before rotation:\n{cmd['logs']}")

                cmd = machine.exec_pod_cmd("sh /etc/cron.daily/auditd.cron")
                assert cmd['exit_code'] == 0, \
                    f"Failed to execute auditd.cron on {machine.name}:\n{cmd['logs']}"

                cmd = machine.exec_pod_cmd("ls -1 /var/log/audit")
                LOG.info(f"auditd log files after rotation:\n{cmd['logs']}")

    def create_pvc(self, ns: str = None, pvc_name: str = None, pvc_size: int = 1,
                   storage_class: str = "kubernetes-hdd", access_mode: str = 'ReadWriteOnce'):
        """
        Render the PVC template, create the PVC and wait until it is 'Bound'

        Args:
            ns: namespace to create the PVC in; 'test-ns-<random>' is used when not set
            pvc_name: PVC name; 'test-pv-claim-<random>' is used when not set
            pvc_size: PVC size in gigabytes
            storage_class: PVC storage class name
            access_mode: PVC access mode

        Returns: object returned by the 'pvolumeclaims.create' call

        """
        suffix = utils.gen_random_string(6)
        target_ns = ns if ns else f'test-ns-{suffix}'
        target_name = pvc_name if pvc_name else f"test-pv-claim-{suffix}"

        render_options = {
            'PVC_NAME': target_name,
            'PVC_SIZE': f'{pvc_size}Gi',
            'ACCESS_MODE': access_mode,
            'STORAGE_CLASS_NAME': storage_class,
        }
        rendered = templates.render_template(settings.CEPH_PVC_YAML, render_options)
        # The parsed YAML is passed through JSON before it is used as the request body
        serialized_body = json.dumps(yaml.load(rendered, Loader=yaml.SafeLoader))

        LOG.info('Creating pvc')
        created_pvc = self.cluster.k8sclient.pvolumeclaims.create(
            name=target_name, namespace=target_ns, body=json.loads(serialized_body))
        self.cluster.check.wait_ceph_pvc_status(
            pvc_name=target_name, namespace=target_ns, expected_status='Bound', timeout=900)
        return created_pvc

    def create_pod(self, pvc_name: str, name: str = None, node_name: str = None, mount_path: str = None,
                   ns: str = None, base_image_repo: str = None):
        """
        Render the PV check pod template, create the pod and wait until it is 'Running'

        Args:
            pvc_name: name of the PVC passed to the template
            name: pod name; 'test-pod-<random>' is used when not set
            node_name: node name passed to the template as NODE_LABEL; k8s node name of the first
                       worker machine is used when not set
            mount_path: filesystem mount path; '/data/test' is used when not set
            ns: namespace to create the pod in; 'test-ns-<random>' is used when not set
            base_image_repo: docker repository address; taken from the cluster when not set

        Returns: object returned by the 'pods.create' call

        """
        suffix = utils.gen_random_string(6)
        target_ns = ns if ns else f'test-ns-{suffix}'
        target_name = name if name else f"test-pod-{suffix}"
        if not mount_path:
            mount_path = '/data/test'
        if not base_image_repo:
            base_image_repo = self.cluster.determine_mcp_docker_registry()
        if not node_name:
            first_worker = self.cluster.get_machines(machine_type='worker')[0]
            node_name = first_worker.get_k8s_node_name()

        render_options = {
            'POD_NAME': target_name,
            'PVC_NAME': pvc_name,
            'MOUNT_PATH': mount_path,
            'NODE_LABEL': node_name,
            'CEPH_NGINX_IMAGE_BASE_REPO': base_image_repo,
        }
        rendered = templates.render_template(settings.CEPH_PV_CHECK_POD_YAML, render_options)
        # The parsed YAML is passed through JSON before it is used as the request body
        serialized_body = json.dumps(yaml.load(rendered, Loader=yaml.SafeLoader))

        LOG.info(f'Create pod on {node_name} node')
        created_pod = self.cluster.k8sclient.pods.create(
            name=target_name, namespace=target_ns, body=json.loads(serialized_body))
        LOG.info(f'Waiting for pod {created_pod.name} is Running.')
        created_pod.wait_phase(phases='Running', timeout=30 * 60, interval=30)
        return created_pod

    def check_pod_filesystem(self, mount_path: str, pod):
        """
        Write/read smoke check for a filesystem mounted into a pod

        Steps:
          - write a random string to a new randomly named file under mount_path
          - check that the file exists
          - read the file and compare its content with the written string

        Args:
            mount_path: filesystem mount path inside the pod
            pod: K8sPod object

        Returns: None

        Raises:
            AssertionError: if the content read from the file differs from the written string

        """
        LOG.info(f"Create new file in attached pv of {pod.name} pod")
        file_path = f'{mount_path}/{utils.gen_random_string(4)}_test.txt'
        expected_string = utils.gen_random_string(10)

        write_cmd = ['/bin/sh', '-c', f'echo "{expected_string}" > {file_path}']
        pod.exec(write_cmd)

        LOG.info(f"Check '{file_path}' file exist")
        pod.check_file_exists(file_path)

        LOG.info(f"Check '{file_path}' file content")
        read_cmd = ['/bin/sh', '-c', f'cat {file_path}']
        actual_string = pod.exec(read_cmd).rstrip()
        assert actual_string == expected_string, \
            (f"Expected file in pod {pod.name} has unexpected content: "
             f"expected: '{expected_string}', actual: {actual_string}")

    def check_custom_hostnames_on_machines(self, machines=None):
        """Ensure that machines hostnames are set according to the customHostnamesEnabled flag

        With the flag enabled, the Machine name must be equal to its k8s node name;
        with the flag disabled, the names must differ.

        Please keep in mind that customHostnamesEnabled takes effect only for the newly created machines.
        Some machines could be created with customHostnamesEnabled == False, some with == True.
        So please call this method with "machines" argument, which is the list of the machines
        created in the *same* test (with the same value in customHostnamesEnabled), if possible.
        All cluster machines are checked when "machines" is not set.
        """
        if not machines:
            machines = self.cluster.get_machines()
        flag = self.cluster.get_custom_hostnames_enabled()

        problems = []
        for machine in machines:
            machine_name = machine.name
            node_name = machine.get_k8s_node_name()
            names_equal = machine_name == node_name
            if flag:
                if not names_equal:
                    problems.append(
                        f"Machine '{machine_name}' name does not match it's hostname {node_name}.  ")
            elif names_equal:
                problems.append(f"Machine '{machine_name}' name match it's hostname {node_name}, "
                                f"although it shoild not.  ")
        err_msg = "".join(problems)

        assert not err_msg, (f"Flag customHostnamesEnabled is {flag}, "
                             f"but some machines were created with wrong hostnames: {err_msg}")
        LOG.info(f"Flag 'customHostnamesEnabled' is '{flag}', hostnames are in expected state for the Machines "
                 f"{[m.name for m in machines]}")

    def check_inplace_distribution_upgrade_completed(self, target_distro):
        """Check that all Machines of the cluster have the target distribution

        Machines are split by 'get_machines_by_distribution' into updated and not updated ones.

        :param target_distro: distribution that is expected on every Machine
        :raises Exception: if at least one Machine is not updated; the message lists
                           each such Machine with its current distribution, one per line
        """
        LOG.info("Check if all machines have the latest allowed OS distribution")
        updated_machines, not_updated_machines = self.cluster.get_machines_by_distribution(target_distro)
        msg = (f"In-place upgrade to distribution '{target_distro}' "
               f"was completed for {len(updated_machines)} Machines\n")
        if not_updated_machines:
            machines_info = "\n".join(f"{m.name}: {m.get_distribution()}" for m in not_updated_machines)
            # Newline after the header keeps the first Machine on its own line
            msg += (f"The following Machines were not been upgraded to the distribution '{target_distro}':\n"
                    f"{machines_info}")
            raise Exception(msg)
        LOG.info(msg)

    def check_inplace_distribution_upgrade_not_started(self):
        """Check that some Machines still wait for the distribution upgrade (postponeDistributionUpdate case)

        The target distribution is the latest available one for the current cluster release.

        Raises Exception if there are no Machines left without the target distribution.
        """
        release_name = self.cluster.clusterrelease_version
        target_distro = self.cluster._manager.get_latest_available_clusterrelease_distro(
            self.cluster, release_name)

        LOG.info("Check that Machines distribution upgrade was not executed "
                 "while postponeDistributionUpdate is enabled")
        updated_machines, not_updated_machines = self.cluster.get_machines_by_distribution(target_distro)

        upgraded_count = len(updated_machines)
        total_count = upgraded_count + len(not_updated_machines)
        summary = (f"Distribution '{target_distro}' installed on "
                   f"{upgraded_count}/{total_count} Machines\n")

        if not_updated_machines:
            # At least one Machine is still on the old distribution, as expected
            LOG.info(summary)
            return

        raise Exception(summary +
                        "All Machines already upgraded to the expected distribution, while it should not be upgraded "
                        "until postponeDistributionUpdate=True")

    def check_machines_reboot(self, uptimes_before, uptimes_after, reboot_expected=None, machines_to_ignore=None):
        """Compare uptimes before and after the test to find unexpected or missing reboots

        A Machine is expected to be rebooted only if 'reboot_expected[machine.name]' is truthy.
        For such a Machine the uptimes must differ by at least 2 seconds; for any other
        Machine the difference must not exceed 1 second.

        :param uptimes_before: dict, keys are machine names, values are machines uptime before test (datetime object)
        :param uptimes_after: dict, keys are machine names, values are machines uptime after test (datetime object)
        :param reboot_expected: dict, keys are machine names, values are boolean
        :param machines_to_ignore: list of machine names to ignore
        :raises AssertionError: if at least one Machine violates the expectation
        """
        reboot_expected = reboot_expected or {}
        machines_to_ignore = machines_to_ignore or []

        problems = []
        for machine_name, uptime_before in uptimes_before.items():
            if machine_name in machines_to_ignore:
                LOG.debug(f"Machine '{machine_name}' is in ignore list, skipping reboot check")
                continue

            # Reboot is expected if a distribution was changed during upgrade,
            # or if reboot_required flag was set for the cluster version
            # in the kaasrelease 'supportedClusterReleases'
            must_reboot = reboot_expected.get(machine_name)

            # Uptimes of the same boot may differ by 1 second because of
            # rounding float numbers in calculations, so the comparison is not exact.
            uptime_after = uptimes_after[machine_name]
            time_delta = abs((uptime_after - uptime_before).total_seconds())
            LOG.debug(f"Time difference between uptime before and after upgrade = {time_delta}")

            if must_reboot and time_delta < 2:
                # Machine should be upgraded and rebooted, but uptime is the same
                problems.append(
                    f"Machine '{machine_name}' should be rebooted, but machine uptime is still the same: "
                    f" uptime before='{uptime_before}', uptime after='{uptime_after}'"
                    f" (must be changed).\n")
            elif not must_reboot and time_delta > 1:
                problems.append(
                    f"Machine '{machine_name}' should NOT be rebooted, but machine uptime has been changed: "
                    f" uptime before='{uptime_before}', uptime after='{uptime_after}'"
                    f" (must be unchanged).\n")

        reboots_msg = "".join(problems)
        assert not reboots_msg, f"Unexpected reboots found for the following Machines:\n{reboots_msg}"

    def check_etcd_storage_quota_negative(self):
        """
        Negative checks for the etcd storage quota, both requests must be rejected:
        1. Try to set a half of the current value (forbidden)
        2. Try to set the current value with the minus sign (forbidden)

        A request is treated as rejected when ApiException with the reason 'Bad Request'
        and 'admission webhook' in the message is raised; any other ApiException is re-raised.
        Exception is raised if a request is accepted.
        """
        current_quota = self.cluster.etcd_storage_quota
        # Split the quota into the numeric part and the unit, e.g. '<number><letters>'
        quota_parts = re.split('([a-z]+)', current_quota, flags=re.IGNORECASE)
        size = float(quota_parts[0])
        unit = quota_parts[1]
        new_quota = f"{size / 2}{unit}"
        quota_negative = f"{-size}{unit}"

        def _expect_rejected(quota, rejected_note, accepted_error):
            # Apply the quota and make sure that the admission webhook denies the request
            try:
                self.cluster.set_etcd_storage_quota(quota_size=quota)
            except ApiException as e:
                message = yaml.safe_load(e.body).get('message', '')
                if not (e.reason == 'Bad Request' and 'admission webhook' in message):
                    raise
                LOG.info(f"{rejected_note} {message}")
            else:
                raise Exception(accepted_error)

        LOG.info(f"Trying to decrease etcd storage quota. Current value: {current_quota}. "
                 f"Trying to set: {new_quota}")
        _expect_rejected(
            new_quota,
            "Cannot decrease quota. This is expected.",
            "Should be not possible to decrease value, but request was succesfully applied")

        LOG.info(f"Trying to set negative value for etcd storage quota. Current value: {current_quota}. "
                 f"Trying to set: {quota_negative}")
        _expect_rejected(
            quota_negative,
            "Cannot set negative quota. This is expected.",
            "Should be not possible to set negative value, but request was succesfully applied")

    def check_etcd_quota_applied(self):
        """Check that the etcd storage quota from the cluster spec is applied on control machines.

        For every control machine the arguments of the 'ucp-etcd' container are
        read with 'docker inspect', and the value of the 'quota-backend-bytes='
        argument is compared with the cluster quota converted to bytes.
        The method does not wait: a container that is missing or is being
        restarted is reported as "not applied", so the caller can re-check.

        Returns:
            bool: True if every control machine reports the expected quota,
                  False otherwise
        """
        machines_quotas_map = {}
        cluster_spec_quota = self.cluster.etcd_storage_quota
        cluster_spec_quota_in_bytes = utils.convert_to_bytes(cluster_spec_quota)
        # The command does not depend on the machine, build it once
        cont_id_cmd = "docker ps --format '{{.ID}} {{.Image}}' | grep ucp-etcd | awk '{print $1}'"
        for machine in self.cluster.get_machines(machine_type='control'):
            ucp_etcd_cont_id = machine.run_cmd(cont_id_cmd).stdout_str
            container_args = []
            if not ucp_etcd_cont_id:
                # Without a container ID 'docker inspect' has nothing to inspect.
                # NOTE(review): assumed to happen while the container is re-created
                LOG.info(f"ucp-etcd container is not found on machine {machine.name}. Will recheck")
            else:
                inspect_cmd = f"docker inspect {ucp_etcd_cont_id}"
                try:
                    container_args = yaml.safe_load(
                        machine.run_cmd(inspect_cmd, verbose=False).stdout_str)[0].get('Args', [])
                except Exception as e:
                    # str(e) is used instead of e.args[0]: 'args' may be empty or
                    # hold a non-string, which would hide the original exception
                    if 'No such object' not in str(e):
                        raise
                    LOG.info("Container is being restarted. Will recheck")
            # None means that the quota argument was not found for the machine
            machines_quotas_map[machine.name] = next(
                (int(arg.split("=")[1]) for arg in container_args if 'quota-backend-bytes=' in arg),
                None)
        is_applied = all(quota == cluster_spec_quota_in_bytes for quota in machines_quotas_map.values())
        if not is_applied:
            LOG.info(f"etcd quota is not applied yet. For every control machine should "
                     f"be value {cluster_spec_quota_in_bytes}, but got \n{yaml.dump(machines_quotas_map)}")
        return is_applied

    def check_no_leftovers_after_upgrade(self):
        """Check that no old lcm-agent binaries and packages marked for autoremove left

        Two checks are performed, disabled machines are skipped in both:

        * on ubuntu machines a dry-run of 'apt-get autoremove' must not report
          any package for removal, kernel packages are ignored;
        * /usr/local/bin must contain exactly one entry matching 'lcm-agent'.

        All problems are collected first and reported with assertions at the end.

        :return: None
        :raises AssertionError: if any leftovers are found
        """
        LOG.info(f"Checking leftovers of lcm-agent on not ubuntu machines and "
                 f"packages marked for autoremove on ubuntu machines in cluster "
                 f"{self.cluster.namespace}/{self.cluster.name}")
        machines = self.cluster.get_machines()
        ub_machines = [machine for machine in machines if machine.host_platform == 'ubuntu']

        rem_errors = []
        agent_errors = []

        # A line of the dry-run output like: 'Remv <package name> [<version>]'
        re_remove_candidate = re.compile(r'^Remv\s+(?P<name>.*)\s+\[(?P<version>.*)\]$')
        re_kernel_package = re.compile(
            r'^linux-(image|headers|modules|modules-extra)-[\d\.\-]+(-generic)?$')

        inform_cmd = "apt-get -s autoremove"
        for machine in ub_machines:
            LOG.info(f"Dry-run of apt-get autoremove on ubuntu {machine.name}/{machine.namespace}")
            if machine.is_disabled():
                LOG.info(f"Machine {machine.namespace}/{machine.name} is disabled, skipping...")
                continue

            # The dry-run is executed only once, its output is parsed below
            stdout = machine.exec_pod_cmd(inform_cmd)['logs']
            packages = {}

            for line in stdout.splitlines():
                match = re_remove_candidate.match(line.strip())
                if not match:
                    continue

                pkg_name = match.group('name')
                pkg_version = match.group('version')

                if re_kernel_package.match(pkg_name):
                    LOG.info(f"Skipping kernel package {pkg_name} [{pkg_version}]")
                    continue

                packages[pkg_name] = pkg_version

            if packages:
                rem_errors.append(("\nPackages marked for autoremove exists on ubuntu machine "
                                   "{}: {}").format(machine.name, ', '.join(packages.keys())))

        # The first command only shows the found entries in the log,
        # the second one returns the number of these entries
        inform_cmd = "ls -la /usr/local/bin | grep lcm-agent"
        check_cmd = "ls -la /usr/local/bin | grep lcm-agent | wc -l"
        # NOTE(review): messages below mention 'not ubuntu' machines while the loop
        # iterates over all the machines of the cluster - confirm the intention
        for machine in machines:
            LOG.info(f"Lcm-agents in /usr/local/bin on not ubuntu machine {machine.name}/{machine.namespace}")
            if machine.is_disabled():
                LOG.info(f"Machine {machine.namespace}/{machine.name} is disabled, skipping...")
                continue
            machine.exec_pod_cmd(inform_cmd)
            ret = machine.exec_pod_cmd(check_cmd)['logs']
            if int(ret) != 1:
                agent_errors.append(
                    f"\nMultiple entities of lcm-agent bin found on not ubuntu machine {machine.name}")

        rem_err = ''.join(rem_errors)
        agent_err = ''.join(agent_errors)
        assert not rem_err, f"Check of marked for autoremove packages failed for cluster " \
                            f"{self.cluster.namespace}/{self.cluster.name}\n" \
                            f"{rem_err}"
        assert not agent_err, f"Check of multiple agent entities failed for cluster " \
                              f"{self.cluster.namespace}/{self.cluster.name}\n" \
                              f"{agent_err}"

    def check_actual_expected_distribution(self):
        """Check distributions deployed on the nodes against Machine specs and the allowed list.

        The check is skipped when settings.SKIP_EXPECTED_DISTRIB_CHECK is set.
        Otherwise two assertions are made, one after another:

        * the distribution deployed on every machine is equal to the one from
          'spec.providerSpec.value.distribution' of the Machine object;
        * the distribution deployed on every machine is one of the 'id' values
          of the distributions allowed for the current cluster release.
        """
        if settings.SKIP_EXPECTED_DISTRIB_CHECK:
            LOG.warning('check_actual_expected_distribution: skipped')
            return

        deployed = self.cluster.get_machines_distributions_from_nodes()
        allowed = self.cluster._manager.get_allowed_distributions(self.cluster.clusterrelease_version)
        allowed_ids = [item.get('id', '') for item in allowed.values()]

        LOG.info("Check for distribution from machine spec is the same as deployed")
        spec_mismatches = {}
        for name, actual in deployed.items():
            from_spec = self.cluster.get_machine(name).data['spec']['providerSpec']['value']['distribution']
            if from_spec == actual:
                continue
            spec_mismatches[name] = {'distribution_from_spec': from_spec,
                                     'distribution_deployed': actual}
        assert not spec_mismatches, (f"Next machines have missmatch between distribution in "
                                     f"machine spec and deployed distribution:\n{yaml.dump(spec_mismatches)}")

        LOG.info("Check for deployed distribution exists in allowed list")
        not_allowed = {name: {'actual': actual, 'allowed_list': allowed_ids}
                       for name, actual in deployed.items()
                       if actual not in allowed_ids}
        assert not not_allowed, (f"Next machines are deployed with distribution that not in allowedDistribution "
                                 f"list:\n{yaml.dump(not_allowed)}")

    def get_hostosconfig_machines_status(self, hostoscfg, timeout=300, interval=10) -> list:
        """Wait until machines appear in the HostOSConfiguration status and return their names
        Args:
            hostoscfg: HostOSConfiguration object, its 'data' is re-read on every check
            timeout: timeout to wait
            interval: time between checks
        Returns: list of keys from the 'status.machinesStates' field
        """
        def _status_machine_names():
            hoc_status = hostoscfg.data.get('status', {})
            if not hoc_status:
                return []
            return list(hoc_status.get('machinesStates', {}).keys())

        LOG.info(f"Wait for machines in HostOSConfiguration:status for '{hostoscfg.name}'")
        timeout_msg = "'status' field doesn't contain machines status for passed HostOSConfiguration resource"
        waiters.wait(lambda: len(_status_machine_names()) > 0,
                     timeout=timeout,
                     interval=interval,
                     timeout_msg=timeout_msg)
        # Read the status once more to return the actual list of machines
        found_names = _status_machine_names()
        LOG.info(f"Following machines are found in HostOSConfiguration status: {found_names}")
        return found_names

    def check_hostosconfig_machine_selector(self, hostoscfg) -> list:
        """Check that machines from the HostOSConfiguration status match its machineSelector
        Args:
            hostoscfg: HostOSConfiguration object
        Returns: list of machine names ('<namespace>/<name>') found in the object status
        Raises:
            Exception: a machine from the status lacks any of 'matchLabels' or has
                       a different value for it
        """
        machine_names = self.get_hostosconfig_machines_status(hostoscfg)
        # The selector is the same for all the machines, so read it only once
        match_labels = hostoscfg.data.get('spec', {}).get('machineSelector', {}).get('matchLabels', {})
        # TODO(ddmitriev): check also that there were not selected Machines that don't match
        #                  the specified 'matchLabels'.
        #                  To select a Machine, all 'matchLabels' from HoC should match the machine labels.
        for machine_name in machine_names:
            # Machine name in the status has the format '<namespace>/<name>'
            machine_obj = self.cluster.get_machine(machine_name.split('/')[1])
            machine_obj_labels = machine_obj.data.get('metadata', {}).get('labels', {})
            for label, value in match_labels.items():
                if label not in machine_obj_labels or value != machine_obj_labels[label]:
                    raise Exception(f'Machine {machine_name} from hostosconfiguration.status.machineNames field '
                                    f'does not have machineSelector label(s) {match_labels}')
        return machine_names

    def _get_hostosconfig_state_item_names(self, hostoscfg_data, with_download_stateitem=True) -> list:
        """Build names of the state items expected for every config of a HostOSConfiguration
        Args:
            hostoscfg_data: HostOSConfiguration object data (dict)
            with_download_stateitem: if True, a 'host-os-download-...' name is added
                                     before the 'host-os-...' name of each config
        Returns: list of state item names in the order of 'spec.configs'
        """
        names = []
        for cfg in hostoscfg_data.get('spec', {}).get('configs', []):
            # Common tail of the names: <object name>-<module>-<version>-<phase>
            tail = (f"{hostoscfg_data['metadata']['name']}-{cfg['module']}-"
                    f"{cfg['moduleVersion']}-{cfg.get('phase', 'reconfigure')}")
            prefixes = ("host-os-download-", "host-os-") if with_download_stateitem else ("host-os-",)
            names.extend(prefix + tail for prefix in prefixes)
        return names

    def _check_lcmcluster_day2_machinetypes(self, hostoscfg):
        """Check that every machineType of the LCMCluster contains all expected state items
        Args:
            hostoscfg: HostOSConfiguration object
        Returns: False for the first machineType which lacks any of the expected
                 item names (the problem is logged), True otherwise
        """
        expected_names = self._get_hostosconfig_state_item_names(hostoscfg.data)
        lcmcluster = self.cluster.get_lcm_cluster(self.cluster.name, self.cluster.namespace)
        machine_types = lcmcluster.data.get('spec', {}).get('machineTypes', {})
        for mtype_name, mtype_items in machine_types.items():
            present_names = {item['name'] for item in mtype_items}
            if present_names.issuperset(expected_names):
                continue
            LOG.error(f'machineType {mtype_name} in LCMCluster {lcmcluster.name} does not contain'
                      f' all of {expected_names} expected item names')
            return False
        return True

    def wait_lcmcluster_day2_machinetypes(self, hostoscfg, timeout=300, interval=10):
        """Wait until machineTypes of the LCMCluster contain state items of the hostosconfig resource
        Args:
            hostoscfg: KaaSHostOSConfiguration object
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """
        waiters.wait(
            lambda: self._check_lcmcluster_day2_machinetypes(hostoscfg),
            timeout=timeout,
            interval=interval,
            timeout_msg='LCM cluster machineTypes have not been updated with hostosconfiguration state items')

    def _check_lcmmachine_day2_stateitems(self, hostoscfg, lcmmachines_timestamps_before=None,
                                          lcmmachines_timestamps_after=None, expected_error_result=False):
        """Check that state items of the HostOSConfiguration were processed on LCMMachines

        For each machine from the HostOSConfiguration status the following is checked:
        * all the expected state items are present in 'spec.stateItems' of the LCMMachine;
        * each expected state item has 'startedAt' and 'finishedAt' in
          'status.stateItemStatuses' of the LCMMachine;
        * timestamps of non-download state items are newer than the timestamps
          from 'lcmmachines_timestamps_before' and are equal to the timestamps
          from 'lcmmachines_timestamps_after', when these are passed for the LCMMachine;
        * 'exitCode' of the state item matches the expectation.
        All the findings are written to the log as a single message.

        Args:
            hostoscfg: HostOSConfiguration object
            lcmmachines_timestamps_before: optional dict, for an LCMMachine name contains
                                           'latest_startedAt' and 'latest_finishedAt' values
            lcmmachines_timestamps_after: optional dict, for an LCMMachine name contains
                                          ['phases'][<state item name>] with 'startedAt'
                                          and 'finishedAt' values
            expected_error_result: if True, non-download state items are expected to have
                                   non-zero exit code, and only the machines which already
                                   have all the expected state items are analyzed
        Returns: True if no problems are found, False otherwise
        """
        lcmmachines_timestamps_before = lcmmachines_timestamps_before or {}
        lcmmachines_timestamps_after = lcmmachines_timestamps_after or {}
        hostoscfg_data = hostoscfg.data
        check_mtype_names = self._get_hostosconfig_state_item_names(hostoscfg_data)
        hostoscfg_mtype_names = self._get_hostosconfig_state_item_names(hostoscfg_data, with_download_stateitem=False)
        machine_names = self.get_hostosconfig_machines_status(hostoscfg)
        messages = []
        error = False

        def _spec_state_item_names(lcm_obj_data):
            # TODO(ddmitriev): check that state_item['params'] contains the data from the hostoscfg
            return [state_item['name'] for state_item in lcm_obj_data.get('spec', {}).get('stateItems', [])]

        if expected_error_result:
            # Only one control machine and maxWorkerUpgradeCount number of worker machines
            # can be at reconfigure phase. Find them and pass further for analysis.
            LOG.info("Expecting error in execution state item result")
            control_machines = []
            worker_machines = []
            for machine_fullname in machine_names:
                # Machine name in the status has the format '<namespace>/<name>'
                lcm_data = machine_fullname.split('/')
                lcm_obj = self.cluster.get_cluster_lcmmachine(lcm_data[1], namespace=lcm_data[0])
                state_item_names = _spec_state_item_names(lcm_obj.data)
                if set(check_mtype_names) - set(state_item_names):
                    # The machine has not got all the expected state items yet
                    continue
                # Target machine is found. Add it to list for analysis
                machine_obj_data = self.cluster.get_machine(lcm_data[1]).data  # Read object data just once
                if machine_obj_data.get('metadata', {}).get('labels', {}).get('cluster.sigs.k8s.io/control-plane'):
                    control_machines.append(machine_fullname)
                else:
                    worker_machines.append(machine_fullname)

            if not control_machines and not worker_machines:
                LOG.warning("No machines are found for processing with expected error result")
                return False

            if len(control_machines) > 1:
                LOG.warning(f"Expected to see one or none of control-plane machine with related state items "
                            f"because of expected failure, but found machines: {control_machines}")
                return False

            if len(worker_machines) > self.cluster.max_worker_upgrade_count:
                LOG.warning("Expected to see number of worker machines with related state items less or equal "
                            f"to {self.cluster.max_worker_upgrade_count} because of expected failure, "
                            f"but found machines: {worker_machines}")
                return False

            machine_names = control_machines + worker_machines

        for machine_fullname in machine_names:
            lcm_data = machine_fullname.split('/')
            lcm_obj = self.cluster.get_cluster_lcmmachine(lcm_data[1], namespace=lcm_data[0])
            lcm_obj_data = lcm_obj.data  # Read object data just once
            state_item_names = _spec_state_item_names(lcm_obj_data)
            if set(check_mtype_names) - set(state_item_names):
                messages.append(f'LCMMachine {machine_fullname}: PENDING stateItems does not contain'
                                f' all of {check_mtype_names} expected item names')
                error = True
                continue

            for mtype in check_mtype_names:
                stateitemstatus = lcm_obj_data.get('status', {}).get('stateItemStatuses', {}).get(mtype, {})
                started_at = stateitemstatus.get('startedAt', '')
                finished_at = stateitemstatus.get('finishedAt', '')
                if 'startedAt' not in stateitemstatus:
                    messages.append(f"LCMMachine {machine_fullname}: PENDING stateItem '{mtype}' not started yet")
                    error = True
                    continue
                if 'finishedAt' not in stateitemstatus:
                    messages.append(f"LCMMachine {machine_fullname}: PENDING stateItem '{mtype}' "
                                    f"started but not finished yet")
                    error = True
                    continue

                # Timestamps are validated only for non-download state items
                if mtype in hostoscfg_mtype_names:
                    if lcm_obj.name in lcmmachines_timestamps_before:
                        # Ensure that the current stateItem starting and finishing time is greater
                        # than the starting and finishing time of the latest task on the LCMMachine
                        # that was applied before creating the current HostOSConfiguration
                        latest_started_datetime = utils.get_datetime_utc(
                            lcmmachines_timestamps_before[lcm_obj.name]['latest_startedAt'])
                        latest_finished_datetime = utils.get_datetime_utc(
                            lcmmachines_timestamps_before[lcm_obj.name]['latest_finishedAt'])
                        current_started_datetime = utils.get_datetime_utc(started_at)
                        current_finished_datetime = utils.get_datetime_utc(finished_at)
                        if current_started_datetime <= latest_started_datetime:
                            messages.append(f"LCMMachine {machine_fullname}: PENDING stateItem '{mtype}' "
                                            f"not started yet, stateItemStatus contains old data")
                            error = True
                            continue
                        if current_finished_datetime <= latest_finished_datetime:
                            messages.append(f"LCMMachine {machine_fullname}: PENDING stateItem '{mtype}' started "
                                            f"but not finished yet, stateItemStatus contains old data")
                            error = True
                            continue
                    if lcm_obj.name in lcmmachines_timestamps_after:
                        # Ensure that the current stateItem starting and finishing time are the same
                        # as the starting and finishing time for this stateItem passed via
                        # lcmmachines_timestamps_after parameter, e.g. that stateItem was not executed
                        # multiple times (or periodically) after HostOSConfiguration object creation
                        saved_phase = lcmmachines_timestamps_after[lcm_obj.name]['phases'][mtype]
                        saved_started_at = saved_phase['startedAt']
                        saved_finished_at = saved_phase['finishedAt']
                        saved_started_at_datetime = utils.get_datetime_utc(saved_started_at)
                        saved_finished_at_datetime = utils.get_datetime_utc(saved_finished_at)
                        current_started_datetime = utils.get_datetime_utc(started_at)
                        current_finished_datetime = utils.get_datetime_utc(finished_at)
                        if current_started_datetime != saved_started_at_datetime:
                            messages.append(f"LCMMachine {machine_fullname}: stateItem '{mtype}' "
                                            f"Current 'startedAt' is not equal to expected time "
                                            f"from 'lcmmachines_timestamps_after' parameter. "
                                            f"stateItem has been re-started unexpectedly.\n"
                                            f"Expected 'startedAt' time: {saved_started_at}\n"
                                            f"Expected 'finishedAt' time: {saved_finished_at}\n"
                                            f"Actual 'startedAt' time: {started_at}\n"
                                            f"Actual 'finishedAt' time: {finished_at}")
                            error = True
                            continue
                        if current_finished_datetime != saved_finished_at_datetime:
                            messages.append(f"LCMMachine {machine_fullname}: stateItem '{mtype}' "
                                            f"Current 'finishedAt' is not equal to expected time "
                                            f"from 'lcmmachines_timestamps_after' parameter. "
                                            f"stateItem has been re-started unexpectedly.\n"
                                            f"Expected 'startedAt' time: {saved_started_at}\n"
                                            f"Expected 'finishedAt' time: {saved_finished_at}\n"
                                            f"Actual 'startedAt' time: {started_at}\n"
                                            f"Actual 'finishedAt' time: {finished_at}")
                            error = True
                            continue

                # Check the stateItem exit code and log the ansible errors if exist
                exit_code = stateitemstatus.get('exitCode')
                attempt = stateitemstatus.get('attempt')
                if expected_error_result and mtype in hostoscfg_mtype_names:
                    # Expect stateItemStatus in error state
                    if exit_code == 0:
                        messages.append(f"LCMMachine {machine_fullname}: ERROR stateItem '{mtype}' "
                                        f"has zero exitCode={exit_code}, while expected an error exit code")
                        error = True
                        continue
                    message = stateitemstatus.get('message', '')
                    filtered_message_lines = utils.filter_ansible_log(message)
                    messages.append(f"LCMMachine {machine_fullname}: EXPECTED stateItem '{mtype}' "
                                    f"non-zero exitCode={exit_code} at {finished_at}")
                    for fline in filtered_message_lines:
                        messages.append(f"  >> {fline}")

                else:
                    # Expect stateItemStatus in success state
                    if exit_code != 0:
                        messages.append(f"LCMMachine {machine_fullname}: ERROR stateItem '{mtype}' "
                                        f"has non-zero exitCode={exit_code}, attempt={attempt}  "
                                        f"from:{started_at}  to:{finished_at}")
                        message = stateitemstatus.get('message', '')
                        filtered_message_lines = utils.filter_ansible_log(message)
                        for fline in filtered_message_lines:
                            messages.append(f"  >> {fline}")
                        error = True
                        continue
                    # Check passed for the stateItem
                    messages.append(f"LCMMachine {machine_fullname}: APPLIED stateItem '{mtype}' at {finished_at}")

        if messages:
            messages_str = '\n'.join(messages)
            LOG.info(f"state messages:\n{messages_str}")

        return not error

    def wait_lcmmachine_day2_stateitems(self, hostoscfg, lcmmachines_timestamps_before=None,
                                        lcmmachines_timestamps_after=None, expected_error_result=False,
                                        timeout=settings.CHECK_LCMMACHINE_DAY2_STATEITEMS_TIMEOUT,
                                        interval=60):
        """Wait until _check_lcmmachine_day2_stateitems() succeeds for the hostosconfig resource
        Args:
            hostoscfg: KaaSHostOSConfiguration object
            lcmmachines_timestamps_before: passed to _check_lcmmachine_day2_stateitems() as is
            lcmmachines_timestamps_after: passed to _check_lcmmachine_day2_stateitems() as is
            expected_error_result: passed to _check_lcmmachine_day2_stateitems() as is
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """
        check_args = (hostoscfg,
                      lcmmachines_timestamps_before,
                      lcmmachines_timestamps_after,
                      expected_error_result)
        waiters.wait(lambda: self._check_lcmmachine_day2_stateitems(*check_args),
                     timeout=timeout,
                     interval=interval,
                     timeout_msg='LCM machines did not apply hostosconfiguration modules correctly')

    def _check_lcmmachine_day2_stateitemsoverwrites(self, hostoscfg_data, absent=False):
        """Check presence or absence of hostosconfig state items in LCMMachine stateItemsOverwrites
        Args:
            hostoscfg_data: HostOSConfiguration object data (dict), not an object itself
            absent: if True, none of the state items built from hostoscfg_data may be present
                    in 'spec.stateItemsOverwrites' of an LCMMachine; if False, all of them
                    must be present there
        Returns: True if the check passed for all the machines from 'status.machinesStates'
                 of hostoscfg_data, False otherwise (problems are logged)
        """
        check_mtype_names = self._get_hostosconfig_state_item_names(hostoscfg_data)
        hoc_status = hostoscfg_data.get('status', {})
        machines_states = hoc_status.get('machinesStates', {}) if hoc_status else {}
        problems = []
        for machine_fullname in machines_states.keys():
            namespace_and_name = machine_fullname.split('/')
            lcm_obj = self.cluster.get_cluster_lcmmachine(namespace_and_name[1], namespace=namespace_and_name[0])
            state_item_names = list(lcm_obj.data.get('spec', {}).get('stateItemsOverwrites', {}).keys())
            if absent:
                # No stateItems from hostoscfg_data may stay in spec.stateItemsOverwrites
                # of the LCMMachine, e.g. after the hostoscfg object removal
                leftovers = set(state_item_names) & set(check_mtype_names)
                if leftovers:
                    problems.append(f'LCMMachine {machine_fullname}: some stateItemsOverwrites from '
                                    f"HostOSConfiguration object {hostoscfg_data['metadata']['name']} are "
                                    f'still present after its deletion: '
                                    f'{leftovers}')
            elif set(check_mtype_names) - set(state_item_names):
                # All stateItems from hostoscfg_data must be in spec.stateItemsOverwrites
                # of the LCMMachine
                problems.append(f'LCMMachine {machine_fullname}: stateItemsOverwrites does not contain'
                                f' all of {check_mtype_names} expected item names in {state_item_names}')

        if problems:
            problems_str = '\n'.join(problems)
            LOG.info(f"\n{problems_str}")

        return not problems

    def wait_lcmmachine_day2_stateitemsoverwrites(self, hostoscfg_data, absent=False, timeout=600, interval=10):
        """Wait for stateItemsOverwrites to be removed from lcmmachine object
        after hostosconfig resource deletion (absent=True)
        Wait for stateItemsOverwrites to be added/not removed from lcmmachine object
        after hostosconfig resource creation (absent=False)
        Args:
            hostoscfg_data: HostOSConfiguration object data (dict), not an object itself
            absent: flag determining if we should check absence or presence of stateItemsOverwrites from hostoscfg_data
            timeout: timeout to wait
            interval: time between checks
        Returns: None
        """
        if absent:
            timeout_msg = 'LCM machines still contain stateItemsOverwrites from deleted hostosconfiguration resource'
        else:
            # Adjacent string literals are used instead of a backslash inside the literal:
            # the backslash continuation keeps the indentation of the next line in the message
            timeout_msg = ('LCM machines still do not have all stateItemsOverwrites from created/updated '
                           'hostosconfiguration resource')
        waiters.wait(lambda: self._check_lcmmachine_day2_stateitemsoverwrites(hostoscfg_data, absent),
                     timeout=timeout,
                     interval=interval,
                     timeout_msg=timeout_msg)

    def _get_hostosconfig_module_spec(self, module_name, hostoscfg):
        """Find a module config in 'spec.configs' of the HostOSConfiguration object
        Args:
            module_name: value of the 'module' key to look for
            hostoscfg: HostOSConfiguration object
        Returns: the first config with the matching 'module', or an empty dict
        """
        configs = hostoscfg.data.get('spec', {}).get('configs', [])
        return next((cfg for cfg in configs if cfg['module'] == module_name), {})

    def check_day2_module_results(self, module_name, hostoscfg, chosen_machine_names):
        """Run the results check which corresponds to the day2 module name
        Args:
            module_name: one of 'sysctl', 'package', 'tmpfile', 'irqbalance',
                         'grub_settings', 'cpushield'
            hostoscfg: HostOSConfiguration object, passed to the module check
            chosen_machine_names: list of machine names, passed to the module check
        Returns: None
        Raises:
            Exception: for any other module name
        """
        supported_modules = ('sysctl', 'package', 'tmpfile', 'irqbalance', 'grub_settings', 'cpushield')
        if module_name not in supported_modules:
            raise Exception(f"Unsupported module name {module_name}")
        # Every supported module has a method named after it
        checker_name = f"_check_day2_{module_name}_module_results"
        LOG.info(f"Running {checker_name}()")
        getattr(self, checker_name)(hostoscfg, chosen_machine_names)

    def _check_day2_sysctl_module_results(self, hostoscfg, chosen_machine_names):
        """Check that all sysctl options were correctly applied on chosen machines
        and were not changed on other machines
        WARN: This method works for sysctl-1.1.0 module only
        Args:
            hostoscfg: KaaSHostOSConfiguration object
            chosen_machine_names: list of machines picked up via machineSelector.
                                  Use result of check_hostosconfig_machine_selector()
                                  function to get validated list of machines
        Returns: None
        Raises:
            Exception: no 'sysctl' module or no 'values.options' in hostoscfg;
                       sysctl file is absent or empty on a chosen machine;
                       an option is not reported or has an unexpected value on a chosen machine;
                       an option has the configured value on a machine which is not chosen
        """
        sysctl_module = self._get_hostosconfig_module_spec('sysctl', hostoscfg)
        if not sysctl_module:
            raise Exception("No 'sysctl' module under .spec.configs of passed hostoscfg object")
        sysctl_values = sysctl_module.get('values', {})
        sysctl_filename = sysctl_values.get('filename', '')
        sysctl_opts = sysctl_values.get('options', {})
        if not sysctl_opts:
            raise Exception("No 'values.options' passed for 'sysctl' module in hostoscfg object")

        # The commands are the same for all the machines
        sysctl_cmd = "sysctl " + " ".join(sysctl_opts.keys())
        sysctl_filepath = f'/etc/sysctl.d/{sysctl_filename}.conf'

        data = {}
        for machine in self.cluster.get_machines():
            machine_full_name = f'{machine.namespace}/{machine.name}'
            # The file is expected on chosen machines only, skip the check for others
            if sysctl_filename and machine_full_name in chosen_machine_names:
                file_exists_non_zero = machine.exec_pod_cmd(
                    f'test -s {sysctl_filepath}',
                    verbose=False)['exit_code']
                if file_exists_non_zero != 0:
                    LOG.error(f'Machine {machine_full_name} should have sysctl file {sysctl_filepath}'
                              f' due to values.filename option specified for sysctl module'
                              f' but this file is absent/has zero size')
                    raise Exception('Some of sysctl files were not created by hostosconfig resource while should be')
            cmd_out = machine.exec_pod_cmd(sysctl_cmd, verbose=False)['logs'].strip()
            actual_opts = {}
            for sysctl_output_line in cmd_out.splitlines():
                # Expected format of a line: '<name> = <value>'
                name, separator, value = sysctl_output_line.partition(' = ')
                if not separator:
                    LOG.warning(f"Machine {machine_full_name}: skipping unexpected line "
                                f"in sysctl output: '{sysctl_output_line}'")
                    continue
                actual_opts[name.strip()] = value.strip()
            data[machine_full_name] = {'sysctl': actual_opts}

        for machine, output in data.items():
            actual_opts = output.get('sysctl', {})
            if machine in chosen_machine_names:
                # Iterate over the expected options, so an option which is missing
                # in the sysctl output is reported instead of being silently skipped
                for sysctl_name, expected_val in sysctl_opts.items():
                    sysctl_val = actual_opts.get(sysctl_name)
                    if sysctl_val != str(expected_val):
                        LOG.error(f'Machine {machine} should have sysctl opt {sysctl_name} set to'
                                  f' {expected_val} by hostosconfig resource but'
                                  f' actually {sysctl_name} = {sysctl_val}')
                        raise Exception('Some of sysctl options were not set by hostosconfig resource while should be')
            else:
                for sysctl_name, sysctl_val in actual_opts.items():
                    if sysctl_name in sysctl_opts and sysctl_val == str(sysctl_opts[sysctl_name]):
                        LOG.error(f'sysctl option {sysctl_name} on machine {machine} should not have'
                                  f' value {sysctl_opts[sysctl_name]} by default')
                        raise Exception('Some of sysctl options were changed by hostosconfig resource on'
                                        ' machines which should not be managed by it')

    def _check_day2_package_module_results(self, hostoscfg, chosen_machine_names):
        """Check that all packages were correctly installed on chosen machines
        and were not - on other machines
        WARN: This method works for package-1.1.0 module only
        Args:
            hostoscfg: KaaSHostOSConfiguration object
            chosen_machine_names: list of machines picked up via machineSelector.
                                  Use result of check_hostosconfig_machine_selector()
                                  function to get validated list of machines
        Returns: None
        """
        package_module = self._get_hostosconfig_module_spec('package', hostoscfg)
        if not package_module:
            raise Exception("No 'package' module under .spec.configs of passed hostoscfg object")
        package_opts = package_module.get('values', {}).get('packages', {})
        if not package_opts:
            raise Exception("No 'values.packages' passed for 'package' module in hostoscfg object")
        data = {}
        dpkg_cmd = ''
        package_states = {}
        yaml_true_regex = re.compile(r'^(y|Y|yes|Yes|YES|true|True|TRUE|on|On|ON)$')
        for p in package_opts:
            dpkg_cmd += f"dpkg -s {p['name']} >/dev/null 2>&1; echo {p['name']}:$?;"
            package_states[p['name']] = {'state': p.get('state', 'present'), 'purge': p.get('purge', 'no')}
        for machine in self.cluster.get_machines():
            machine_full_name = f'{machine.namespace}/{machine.name}'
            data[machine_full_name] = {}
            cmd_out = machine.exec_pod_cmd(
                dpkg_cmd,
                verbose=False)['logs'].strip()
            data[machine_full_name]['packages'] = {}
            for pkg_output_line in cmd_out.split('\n'):
                pkg_output_list = pkg_output_line.split(':')
                data[machine_full_name]['packages'][pkg_output_list[0]] = pkg_output_list[1]
        for machine, output in data.items():
            for pkg_name, pkg_status in output.get('packages', {}).items():
                if machine in chosen_machine_names:
                    if pkg_status == '1' and package_states[pkg_name]['state'] == 'present':
                        LOG.error(f'{pkg_name} package should be installed on machine {machine}'
                                  f' by hostosconfig resource but it is still absent')
                        raise Exception('Some of packages are still absent'
                                        ' while should be installed by hostosconfig resource')
                    if pkg_status == '0' and (package_states[pkg_name]['state'] == 'absent'
                                              and yaml_true_regex.match(package_states[pkg_name]['purge'])):
                        LOG.error(f'{pkg_name} package should be removed from machine {machine}'
                                  f' by hostosconfig resource but it is still installed')
                        raise Exception('Some of packages are still absent'
                                        ' while should be installed by hostosconfig resource')
                else:
                    if pkg_status == '0' and package_states[pkg_name]['state'] == 'present':
                        LOG.error(f'{pkg_name} package should NOT be installed by default on machine {machine}')
                        raise Exception('Some of packages are installed by hostosconfig'
                                        ' resource on machines which should not be managed by it')
                    if pkg_status == '1' and (package_states[pkg_name]['state'] == 'absent'
                                              and yaml_true_regex.match(package_states[pkg_name]['purge'])):
                        LOG.error(f'{pkg_name} package has been removed from machine {machine}'
                                  f' by hostosconfig resource but this machine should NOT be touched')
                        raise Exception('Some of packages are removed by hostosconfig'
                                        ' resource on machines which should not be managed by it')

    def _check_day2_tmpfile_module_results(self, hostoscfg, chosen_machine_names):
        """Check that all tmp files were created on chosen machines
        and were not - on other machines
        Args:
            hostoscfg: KaaSHostOSConfiguration object
            chosen_machine_names: list of machines picked up via machineSelector.
                                  Use result of check_hostosconfig_machine_selector()
                                  function to get validated list of machines
        Returns: None
        """
        tmpfile_module = self._get_hostosconfig_module_spec('tmpfile', hostoscfg)
        if not tmpfile_module:
            raise Exception("No 'tmpfile' module under .spec.configs of passed hostoscfg object")
        tmpfile_secret_values = tmpfile_module.get('secretValues', {})
        if tmpfile_secret_values:
            tmpfile_secret_name = tmpfile_secret_values.get('name', '')
            tmpfile_secret_ns = tmpfile_secret_values.get('namespace', '')
            if not tmpfile_secret_name or not tmpfile_secret_ns:
                raise Exception("secretValues is used, but no 'secretValues.name or secretValues.namespace' option "
                                "passed for 'tmpfile' module in hostoscfg object")
            if self._cluster._manager.api.secrets.present(name=tmpfile_secret_name, namespace=tmpfile_secret_ns):
                tmpfile_secret = self._cluster._manager.api.secrets.get(name=tmpfile_secret_name,
                                                                        namespace=tmpfile_secret_ns)
            else:
                raise Exception(f"secretValues is used, but {tmpfile_secret_ns}/{tmpfile_secret_name} is absent!")
            tmpfile_filename = base64.b64decode(tmpfile_secret.data.get('data', {}).
                                                get('filename', '')).decode("utf-8")
        else:
            tmpfile_values = tmpfile_module.get('values', {})
            tmpfile_filename = tmpfile_values.get('filename', '')
        if not tmpfile_filename:
            raise Exception("No 'filename' option passed for 'tmpfile' module in hostoscfg object")
        data = {}
        for machine in self.cluster.get_machines():
            machine_full_name = f'{machine.namespace}/{machine.name}'
            data[machine_full_name] = {}
            tmpfile_filepath = f'/tmp/{tmpfile_filename}'
            cmd_exitcode = machine.exec_pod_cmd(
                f'test -s {tmpfile_filepath}',
                verbose=True)['exit_code']
            data[machine_full_name]['tmpfile'] = {tmpfile_filename: cmd_exitcode}
        for machine, output in data.items():
            for tmpfile_name, tmpfile_status in output.get('tmpfile', {}).items():
                if machine in chosen_machine_names:
                    if tmpfile_status != 0:
                        LOG.error(f'Machine {machine} should have file {tmpfile_name} created'
                                  f' but file does not exist or is empty')
                        raise Exception('Some of temporary files were not created by '
                                        'hostosconfig resource while should be')
                else:
                    if tmpfile_status == 0:
                        LOG.error(f'Temporary file {tmpfile_name} exists on machine {machine} while'
                                  f' it should not be there')
                        raise Exception('Some of temporary files were created by hostosconfig resource on'
                                        ' machines which should not be managed by it')

    def _check_day2_irqbalance_module_results(self, hostoscfg, chosen_machine_names):
        """Check configuration file for irqbalance on hosts and compare data from hostcfg object
        Args:
            hostoscfg: KaaSHostOSConfiguration object
            chosen_machine_names: list of machines picked up via machineSelector.
                                  Use result of check_hostosconfig_machine_selector()
                                  function to get validated list of machines
        Returns: None
        """
        irqbalance_module = self._get_hostosconfig_module_spec('irqbalance', hostoscfg)
        if not irqbalance_module:
            raise Exception("No 'irqbalance' module under .spec.configs of passed hostoscfg object")
        supported_opts_map = {'banned_cpulist': 'IRQBALANCE_BANNED_CPULIST', 'args': 'IRQBALANCE_ARGS'}
        irqbalance_values = irqbalance_module.get('values', {})
        wrong_values = {}

        machines = self.cluster.get_machines()
        choosen_machines = [m for m in machines if f'{m.namespace}/{m.name}' in chosen_machine_names]
        for machine in choosen_machines:
            machine_name = machine.name
            irq_conf = machine.exec_pod_cmd('cat /etc/default/irqbalance')['logs']
            for line in irq_conf.splitlines():
                for k, v in irqbalance_values.items():
                    if k in supported_opts_map.keys() and line.startswith(supported_opts_map[k]):
                        option = line.split('=')[0]
                        value = line.split(str(option)+"=")[1]
                        if value == v:
                            LOG.info(
                                f"IrqBalance option {option} with value {value} for machine {machine_name} as expected")
                            continue
                        else:
                            LOG.error(
                                f"IrqBalance option {option} with value {value} "
                                f"for machine {machine_name} not as expected. Expected: {supported_opts_map[k]}={v}")
                            wrong_values.setdefault(
                                machine.name, {'actual': {}, 'expected': {}})['actual'][option] = str(value)
                            wrong_values[machine_name]['expected'][k] = str(v)
                            LOG.info(f"\n{yaml.dump(wrong_values[machine_name])}")

            if 'enabled' in irqbalance_values.keys() and not irqbalance_values['enabled']:
                # enabled:false was passed. Need to check daemon is disabled
                LOG.info("Got enabled:false in HOC spec for irqbalance. Will check that service disabled")
                systemctl_status_irq = machine.exec_pod_cmd('systemctl status irqbalance')['exit_code']
                if systemctl_status_irq == 0:
                    """0 program is running or service is OK
                       1 program is dead and /var/run pid file exists
                       2 program is dead and /var/lock lock file exists
                       3 program is not running"""
                    LOG.error("irqbalance is active, but should be disabled")
                    wrong_values.setdefault(
                        machine.name, {'actual': {}, 'expected': {}})['actual']['service_enabled'] = True
                    wrong_values[machine.name]['expected']['service_enabled'] = False
                elif systemctl_status_irq != 3:
                    msg = f"Unexpected exit code for irqbalance service status. Exite code: {systemctl_status_irq}"
                    raise RuntimeError(msg)
                else:
                    LOG.info("Service disabled, as expected")

        assert not wrong_values, f"IRQBALANCE config on host is not as expected. \n{yaml.dump(wrong_values)}"

    def _check_day2_grub_settings_module_results(self, hostoscfg, chosen_machine_names):
        """Check that all Grub options were correctly applied on chosen machines
        and were not changed on other machines
        Args:
            hostoscfg: KaaSHostOSConfiguration object
            chosen_machine_names: list of machines picked up via machineSelector.
                                  Use result of check_hostosconfig_machine_selector()
                                  function to get validated list of machines
        Returns: None
        """
        grub_settings_module = self._get_hostosconfig_module_spec('grub_settings', hostoscfg)
        if not grub_settings_module:
            raise Exception("No 'grub_settings' module under .spec.configs of passed hostoscfg object")
        grub_settings_values = grub_settings_module.get('values', {})
        if not grub_settings_values:
            raise Exception("No 'values' passed for 'grub_settings' module in hostoscfg object")
        reset_to_defaults = grub_settings_values.get('grub_reset_to_defaults', False)
        grub_settings_opts = grub_settings_values.get('options', {})
        if not reset_to_defaults and not grub_settings_opts:
            raise Exception("Neither 'values.reset_to_defaults' nor "
                            "'values.options' passed for 'grub_settings' module in hostoscfg object")
        grub_settings_filename = grub_settings_values.get('grub_cfg_filename', '99-grub_settings_hoc_module.cfg')
        data = {}
        for machine in self.cluster.get_machines():
            machine_full_name = f'{machine.namespace}/{machine.name}'
            data[machine_full_name] = {}
            grub_settings_filepath = f'/etc/default/grub.d/{grub_settings_filename}'
            cfg_file_exists_non_zero = machine.exec_pod_cmd(
                f'test -s {grub_settings_filepath}',
                verbose=False)['exit_code']
            if machine_full_name in chosen_machine_names:
                if reset_to_defaults and cfg_file_exists_non_zero == 0:
                    LOG.error(f'Machine {machine_full_name} has Grub config file {grub_settings_filepath}'
                              f' while it should be removed due to values.grub_reset_to_defaults parameter is true')
                    raise Exception('Grub config file should be removed by hostosconfig resource while it still exists')
                if not reset_to_defaults and cfg_file_exists_non_zero != 0:
                    LOG.error(f'Machine {machine_full_name} should have Grub config file {grub_settings_filepath}'
                              f' due to values.grub_cfg_filename option specified for grub_settings module'
                              f' but this file is absent/has zero size')
                    raise Exception('Grub config file was not created by hostosconfig resource while should be')
            if not grub_settings_values.get('disable_reboot_request', False):
                reboot_request_file_exists_non_zero = machine.exec_pod_cmd(
                    'test -s /run/day2/reboot-required',
                    verbose=False)['exit_code']
                if machine_full_name in chosen_machine_names:
                    if reboot_request_file_exists_non_zero != 0:
                        LOG.error(f'Machine {machine_full_name} should have reboot request file '
                                  ' /run/day2/reboot-required due to values.disable_reboot_request option'
                                  ' is False or not set for grub_settings module'
                                  ' but this file is absent/has zero size')
                        raise Exception('Reboot request file was not created/updated'
                                        ' by hostosconfig resource while should be')
                    else:
                        reboot_request_file_valid_content = machine.exec_pod_cmd(
                            'grep -q /etc/default/grub.d /run/day2/reboot-required',
                            verbose=False)['exit_code']
                        if reboot_request_file_valid_content != 0:
                            LOG.error(f'Machine {machine_full_name} has reboot request file /run/day2/reboot-required'
                                      ' but this file has not been updated with module-specific reason line')
                            raise Exception('Reboot request file exists but does not contain reboot reason'
                                            ' from grub_settings module')
            if not reset_to_defaults:
                cmd_out = machine.exec_pod_cmd(
                    "cat " + grub_settings_filepath,
                    verbose=False)
                data[machine_full_name]['grub_settings'] = {}
                if cmd_out['exit_code'] == 0:
                    for grub_settings_output_line in cmd_out['logs'].strip().split('\n'):
                        grub_settings_output_list = grub_settings_output_line.split('=', 1)
                        data[machine_full_name]['grub_settings'][grub_settings_output_list[0].lower()] = \
                            grub_settings_output_list[1].replace("'", '').replace('"', '')
        if not reset_to_defaults:
            for machine, output in data.items():
                if (output.get('grub_settings', {}).keys() != grub_settings_opts.keys()
                        and machine in chosen_machine_names):
                    LOG.error(f'List of {machine} Grub options in {grub_settings_filename} differs'
                              f' from those specified in HOC object for grub_settings module')
                    raise Exception('Some of custom Grub options set on machine are not the same'
                                    ' as in hostosconfig resource')
                for grub_opt_name, grub_opt_val in output.get('grub_settings', {}).items():
                    if machine in chosen_machine_names:
                        if grub_opt_name == 'grub_cmdline_linux' or grub_opt_name == 'grub_cmdline_linux_default':
                            grub_settings_opt_val = ' '.join(grub_settings_opts[grub_opt_name])
                        else:
                            grub_settings_opt_val = str(grub_settings_opts[grub_opt_name])
                        if grub_opt_val != grub_settings_opt_val:
                            LOG.error(f'Machine {machine} should have Grub opt {grub_opt_name} set to'
                                      f' {grub_settings_opts[grub_opt_name]} by hostosconfig resource but'
                                      f' actually {grub_opt_name} = {grub_opt_val}')
                            raise Exception('Some of Grub options have values that do not match with'
                                            ' hostosconfig resource parameters')
                    else:
                        if grub_opt_val == str(grub_settings_opts[grub_opt_name]):
                            LOG.error(f'Grub option {grub_opt_name} on machine {machine} should not have'
                                      f'value {grub_settings_opts[grub_opt_name]} by default')
                            raise Exception('Some of Grub options were changed by hostosconfig resource on'
                                            ' machines which should not be managed by it')

    def _check_day2_cpushield_module_results(self, hostoscfg, chosen_machine_names):
        """Check that all systemd CPU/NUMA shielding options were correctly applied on chosen machines
        and were not changed on other machines
        Args:
            hostoscfg: KaaSHostOSConfiguration object
            chosen_machine_names: list of machines picked up via machineSelector.
                                  Use result of check_hostosconfig_machine_selector()
                                  function to get validated list of machines
        Returns: None

        NOTE: There is no sense to create more than one HostOSConfiguration object with this module per machine, as
        systemd drop-in configuration file name is hardcoded to '99-shielding.conf'. CPU cores/NUMA nodes from newer
        HOC object will completely regenerate that config file. User is warned about this behavior in module doc page
        https://gerrit.mcp.mirantis.com/c/kaas/kaas-docs/+/209827
        """
        cpushield_module = self._get_hostosconfig_module_spec('cpushield', hostoscfg)
        if not cpushield_module:
            raise Exception("No 'cpushield' module under .spec.configs of passed hostoscfg object")
        cpushield_values = cpushield_module.get('values', {})
        if not cpushield_values:
            raise Exception("No 'values' passed for 'cpushield' module in hostoscfg object")
        disable_old_shield_service = cpushield_values.get('disable_old_shield_service', False)
        old_shield_service_name = cpushield_values.get('old_shield_service_name', 'shield-cpus.service')
        cpushield_dropin_cfg = '99-shielding.conf'
        for machine in self.cluster.get_machines():
            machine_full_name = f'{machine.namespace}/{machine.name}'
            # Check if old shield service is disabled
            if disable_old_shield_service:
                old_service_enabled = machine.exec_pod_cmd(
                    f'systemctl --quiet is-enabled {old_shield_service_name}',
                    verbose=False)['exit_code']
                if machine_full_name in chosen_machine_names:
                    if old_service_enabled == 0:
                        LOG.error(f'Machine {machine_full_name} should have'
                                  f' systemd service {old_shield_service_name}'
                                  f' disabled due to values.disable_old_shield_service option'
                                  f' is True but this service is still enabled')
                        raise Exception('Old-style systemd shielding service should be disabled by '
                                        'hostosconfig resource while it is enabled')
            # Check if script for init.scope is present
            init_scope_script_path = '/usr/bin/shield-cpu-numa-systemd-init-scope.sh'
            init_scope_service = 'shield-cpu-numa-systemd-init-scope.service'
            init_scope_service_enabled = machine.exec_pod_cmd(
                f'test -s {init_scope_script_path} && systemctl --quiet is-enabled {init_scope_service}',
                verbose=False)['exit_code']
            if machine_full_name in chosen_machine_names:
                if init_scope_service_enabled != 0:
                    LOG.error(f'Machine {machine_full_name} should have'
                              f' systemd service {init_scope_service}'
                              f' enabled to pin PID 1 onto chosen CPU core/NUMA node'
                              f' but this service is not enabled (or does not exist)')
                    raise Exception('Systemd init.scope shielding service should be enabled by '
                                    'hostosconfig resource while it is not')
            else:
                if init_scope_service_enabled == 0:
                    LOG.error(f'Machine {machine_full_name} has systemd service {init_scope_service}'
                              f' enabled while it should not be present on it')
                    raise Exception('Systemd init.scope shielding service has been created by'
                                    ' hostosconfig resource on machines which should not be managed by it')
            systemd_units_to_pin = cpushield_values.get('systemd_units_to_pin', [])
            for unit in systemd_units_to_pin:
                cpushield_dropin_filepath = f'/etc/systemd/system/{unit}.d/{cpushield_dropin_cfg}'
                cmd_out = machine.exec_pod_cmd(
                    f'cat {cpushield_dropin_filepath}',
                    verbose=False)
                if machine_full_name in chosen_machine_names:
                    if cmd_out['exit_code'] != 0:
                        LOG.error(f'Machine {machine_full_name} should have'
                                  f' systemd config file {cpushield_dropin_filepath}'
                                  f' corresponding to values.systemd_units_to_pin option'
                                  f' but this file is absent')
                        raise Exception(f'Systemd drop-in config file was not created by hostosconfig '
                                        f'resource for unit {unit} while should be')
                    else:
                        allowed_cfg_lines = [f'[{unit.split(".")[-1].capitalize()}]',
                                             'AllowedCPUs=' + cpushield_values.get('system_cpus')]
                        if 'system_mem_numas' in cpushield_values:
                            allowed_cfg_lines.append('AllowedMemoryNodes=' + cpushield_values.get('system_mem_numas'))
                        for output_line in cmd_out['logs'].strip().split('\n'):
                            if output_line not in allowed_cfg_lines:
                                LOG.error(f'Machine {machine_full_name} has a wrong set of'
                                          f' systemd config file {cpushield_dropin_filepath}'
                                          f' options. Only AllowedCPUs, AllowedMemoryNodes parameters'
                                          f' equal to corresponding module values are allowed')
                                raise Exception(f'Systemd drop-in config file for unit {unit} '
                                                f'contains a bad content: {output_line}')
                else:
                    if cmd_out['exit_code'] == 0:
                        LOG.error(f'Machine {machine_full_name} has systemd config file {cpushield_dropin_filepath}'
                                  f' while it should not be present on it')
                        raise Exception('Systemd config file has been created by hostosconfig resource on'
                                        ' machines which should not be managed by it')
            if not cpushield_values.get('disable_reboot_request', False):
                reboot_request_file_exists_non_zero = machine.exec_pod_cmd(
                    'test -s /run/day2/reboot-required',
                    verbose=False)['exit_code']
                if machine_full_name in chosen_machine_names:
                    if reboot_request_file_exists_non_zero != 0:
                        LOG.error(f'Machine {machine_full_name} should have reboot request file '
                                  ' /run/day2/reboot-required as cpushield module should create it by default'
                                  ' but this file is absent/has zero size')
                        raise Exception('Reboot request file was not created/updated'
                                        ' by hostosconfig resource while should be')
                    else:
                        reboot_request_file_valid_content = machine.exec_pod_cmd(
                            'grep -q cpushield /run/day2/reboot-required',
                            verbose=False)['exit_code']
                        if reboot_request_file_valid_content != 0:
                            LOG.error(f'Machine {machine_full_name} has reboot request file /run/day2/reboot-required'
                                      ' but this file has not been updated with module-specific reason line')
                            raise Exception('Reboot request file exists but does not contain reboot reason'
                                            ' from cpushield module')

    def wait_miraceph_nodes_updated(self, interval=30, timeout=120):
        """
        Wait until the nodes section of the MiraCeph object matches the nodes
        section of the KaaSCephCluster object.

        After KaaSCephCluster is patched, it takes some time until the data is
        passed to MiraCeph, and some operations could not be performed before that.
        Both objects are reduced to a '<k8s node name> -> [device names]' mapping
        and the mappings are compared on every polling attempt.
        """
        def check_miraceph_nodes_updated():
            LOG.info(f"Check that MiraCeph nodes are updated from KaasCephCluster "
                     f"'{self.cluster.namespace}/{self.cluster.name}'")
            miraceph_obj = self.cluster.k8sclient.miracephs.get(name='rook-ceph', namespace='ceph-lcm-mirantis')
            miraceph_nodes = miraceph_obj.data.get('spec', {}).get('nodes', [])
            kaasceph_spec = self.cluster.get_cephcluster().data.get('spec', {})
            kaasceph_nodes = kaasceph_spec.get('cephClusterSpec', {}).get('nodes', {})

            # KaaSCephCluster is keyed by machine name, MiraCeph - by k8s node name
            expected_disks = {}
            for machine_name, node_spec in kaasceph_nodes.items():
                k8s_node_name = self.cluster.get_machine(machine_name).get_k8s_node_name()
                expected_disks[k8s_node_name] = [dev.get('name') for dev in node_spec.get('storageDevices', [])]
            actual_disks = {}
            for node in miraceph_nodes:
                actual_disks[node.get('name')] = [dev.get('name') for dev in node.get('devices', [])]

            LOG.debug(f"Miraceph nodes info:\n{yaml.dump(actual_disks)}")
            LOG.debug(f"Kaasceph nodes info:\n{yaml.dump(expected_disks)}")

            not_synced_yet = set(expected_disks.keys()) - set(actual_disks.keys())
            if not_synced_yet:
                LOG.warning(f"KaasCephCluster nodes {not_synced_yet} are missing in MiraCeph")
            unexpected = set(actual_disks.keys()) - set(expected_disks.keys())
            if unexpected:
                LOG.warning(f"MiraCeph contains extra nodes {unexpected} "
                            f"that not present in KaasCephCluster")
            return expected_disks == actual_disks

        waiters.wait(check_miraceph_nodes_updated, interval=interval, timeout=timeout)
        LOG.info("MiraCeph nodes are updated from KaasCephCluster successfully")

    def wait_hoc_state_item_statuses_are_changed_in_lcmmachine(self, lcmmachine, lcmmachine_timestamp_before=None,
                                                               timeout=300, interval=10):
        """
        Wait until the timestamps reported for the given LCMMachine differ from
        the previously saved ones, which means that the HOC object has started
        to change its states.

        Args:
            lcmmachine: LCMMachine object to watch
            lcmmachine_timestamp_before: timestamps taken before the change, to compare with
            timeout: how long to wait for the change
            interval: polling interval
        """
        no_changes_msg = (f"LCM machines stateItemsStatuses section doesn't have any changes "
                          f"for machine {lcmmachine.name}")
        waiters.wait(
            lambda: not self.cluster.get_lcmmachine_timestamps(lcmmachine) == lcmmachine_timestamp_before,
            timeout=timeout,
            interval=interval,
            timeout_msg=no_changes_msg,
        )

    def _check_no_k8s_leftovers_in_other_runtime(self):
        """Check that machines have no k8s containers left in the runtime they do not use.

        For every Machine of the cluster a shell command is executed with
        machine.exec_pod_cmd():
          - machine runtime contains 'containerd': docker containers are grepped for 'k8s_'
          - machine runtime contains 'docker': containers from the containerd
            'k8s.io' namespace are grepped for 'io.containerd'
        A zero exit code of the command is treated as "leftovers found".
        All findings are logged as warnings, one machine per line.

        :raises RuntimeError: if a machine runtime contains neither 'containerd' nor 'docker'
        :return: bool - True if no leftovers were found on any machine, False otherwise
        """
        LOG.info('Checking leftovers runtime')
        machines = self.cluster.get_machines()
        # Each command inspects the runtime *opposite* to the one reported by the machine
        docker_leftovers_cmd = 'docker ps -a | grep k8s_'
        containerd_leftovers_cmd = 'ctr --namespace k8s.io containers ls | tail -n +2 | grep io.containerd'
        errors = []
        leftovers = []

        for machine in machines:
            if 'containerd' in machine.runtime:
                chk_cmd = docker_leftovers_cmd
            elif 'docker' in machine.runtime:
                chk_cmd = containerd_leftovers_cmd
            else:
                # weird and probably impossible case but needs to be also covered.
                raise RuntimeError('Machine runtime can not be determined')
            res = machine.exec_pod_cmd(chk_cmd, verbose=False)
            if res['exit_code'] == 0:
                errors.append(f"k8s entities still exists on machine {self.cluster.name}/{machine.name} with "
                              f"runtime {machine.runtime}")
                leftovers.append(f"{machine.name}\n{res['logs'].strip()}\n---\n")

        if not errors:
            return True

        # Join per-machine messages with a newline so they do not run together in the log
        LOG.warning("\n".join(errors))
        LOG.warning('Leftovers in runtime found:')
        LOG.warning("".join(leftovers))
        return False

    def _check_runtime_status(self):
        """Tell whether every machine already runs its preferred container runtime.

        Only machines with a non-empty 'kaas.mirantis.com/preferred-container-runtime'
        annotation are inspected: the annotation value must be contained in
        machine.runtime. Every mismatch is logged.

        :return: bool - True if no mismatch was found, False if the runtime of at least
                 one machine does not match its annotation (migration in progress)
        """
        runtime_annotation = 'kaas.mirantis.com/preferred-container-runtime'
        all_match = True
        for machine in self.cluster.get_machines():
            preferred = machine.annotations.get(runtime_annotation, None)
            # No annotation - the machine is not migrated and not planned to migrate, nothing to check
            if preferred and preferred not in machine.runtime:
                LOG.info(f"{machine.namespace}/{machine.name} have runtime "
                         f"{machine.runtime} while expected {preferred}")
                # Do not stop here: report the mismatches on all machines
                all_match = False
        return all_match

    def check_runtime(self, timeout=9600, interval=20):
        """Wait until all runtime readiness checks pass.

        On every attempt three checks are executed: machines conditions
        (against the "Ready" status), runtime status and k8s leftovers in
        the other runtime. The attempt fails with RuntimeError unless all
        of them return a truthy value.

        :param timeout: int, passed to waiters.wait_pass as the timeout
        :param interval: int, passed to waiters.wait_pass as the polling interval
        """
        expected_status = "Ready"

        def all_checks_pass():
            # Every check is executed on each attempt (no short-circuit),
            # so each of them can report its own problems
            results = (
                self._check_machines_conditions(expected_status),
                self._check_runtime_status(),
                self._check_no_k8s_leftovers_in_other_runtime(),
            )
            if all(results):
                return True
            raise RuntimeError(f"Not all conditions for runtime readiness "
                               f"are in status {expected_status}")

        LOG.info(f"Waiting {timeout} sec until runtime will be ready")
        # Exception types passed as 'expected' - presumably the ones the waiter retries on
        retried_errors = (RuntimeError, ApiException,
                          KeyError, TypeError, MaxRetryError)
        waiters.wait_pass(all_checks_pass,
                          timeout=timeout, interval=interval,
                          expected=retried_errors,
                          timeout_msg=f"Runtime still not ready after {timeout} seconds")

    def compare_machines_runtime_with_desired(self, machines, machine_is_new=False):
        """Verify that the given machines use the runtime from settings.DESIRED_RUNTIME.

        Two checks are done for every machine:
          - the part of machine.runtime before the first ':' equals settings.DESIRED_RUNTIME
          - the command listing k8s containers in the desired runtime, executed with
            machine.exec_pod_cmd(), exits with code 0

        :param machines: list of Machine objects; pytest.skip() is called if it is empty
        :param machine_is_new: bool, if True - the machines must not have the
            'kaas.mirantis.com/preferred-container-runtime' annotation set
        :raises ValueError: if settings.DESIRED_RUNTIME is neither 'containerd' nor 'docker'
        :raises Exception: if a new machine has the preferred runtime annotation,
            or if any of the two checks failed for at least one machine
        """
        if not machines:
            pytest.skip("No machines to check runtime")

        # The check command depends only on the desired runtime, so select it once
        # and fail early with a clear error instead of hitting an unbound variable later
        check_commands = {
            'containerd': 'ctr --namespace k8s.io containers ls | tail -n +2 | grep io.containerd',
            'docker': 'docker ps -a | grep k8s_',
        }
        if settings.DESIRED_RUNTIME not in check_commands:
            raise ValueError(f"Unsupported DESIRED_RUNTIME '{settings.DESIRED_RUNTIME}', "
                             f"expected one of: {sorted(check_commands)}")
        chk_cmd = check_commands[settings.DESIRED_RUNTIME]

        failed_runtime_in_machine_object = []
        failed_runtime_on_machine = []
        for machine in machines:
            # runtime annotation on a new machine should be empty
            desired_runtime_from_annotation = machine.annotations.get(
                'kaas.mirantis.com/preferred-container-runtime', None)
            if machine_is_new and desired_runtime_from_annotation:
                raise Exception(f"New added machine contains desired_runtime: {desired_runtime_from_annotation}, "
                                f"but shouldn't")

            # check node runtime reported in the machine data
            if machine.runtime.split(':')[0] == settings.DESIRED_RUNTIME:
                LOG.info(f"Current container runtime in node data: {machine.runtime} "
                         f"is the same as desired: {settings.DESIRED_RUNTIME}")
            else:
                failed_runtime_in_machine_object.append(machine)

            # check the containers on the node itself
            res = machine.exec_pod_cmd(chk_cmd, verbose=False)
            if res['exit_code'] == 0:
                LOG.info(f"Container runtime is the same as in desired: {settings.DESIRED_RUNTIME}")
            else:
                failed_runtime_on_machine.append(machine)

        if failed_runtime_in_machine_object:
            raise Exception(f"Looks like that real machine runtime is not expected for machines: "
                            f"{failed_runtime_in_machine_object}, for partially migrated cluster it should be "
                            f"docker, for fully migrated containerd")

        if failed_runtime_on_machine:
            raise Exception(f"Looks like that real machine runtime is not expected for machines: "
                            f"{failed_runtime_on_machine}, for partially migrated cluster it should be "
                            f"docker, for fully migrated containerd")

    def compare_cluster_runtime_with_desired(self):
        """Verify that the cluster and all its machines use settings.DESIRED_RUNTIME.

        First the runtime reported by the cluster object is asserted to be equal
        to the desired one, then the machines of the cluster are checked with
        compare_machines_runtime_with_desired().
        """
        desired = settings.DESIRED_RUNTIME
        actual = self.cluster.runtime.runtime
        assert actual == desired, (
            f"Cluster has wrong runtime {actual} or "
            f"migration in progress. Expected runtime: {desired}")
        LOG.info(f"Cluster runtime: {actual} is correct")

        # Cluster-level runtime is fine, now verify every machine
        cluster_machines = self.cluster.get_machines()
        self.cluster.check.compare_machines_runtime_with_desired(cluster_machines, machine_is_new=False)
        LOG.info(f"Runtime on machines is correct: {actual}")

    def check_bmh_inventory_presense(self):
        """Check that every Machine has the related BareMetalHostInventory object.

        The check is skipped when the KaaS release version of the management cluster
        is lower than 'kaas-2-29-0-rc' or when the cluster provider is not baremetal.
        Missing objects are logged as an error; an exception is raised for them
        only if settings.CHECK_BMH_INVENTORY is enabled.

        :raises AssertionError: if the BareMetalHostInventory API is not available
        :raises Exception: if some objects are missing and CHECK_BMH_INVENTORY is enabled
        """
        kaas_manager = self.cluster._manager
        mgmt_cluster = kaas_manager.get_mgmt_cluster()
        kaas_version = version.parse(mgmt_cluster.get_kaasrelease_version())
        if kaas_version < version.parse("kaas-2-29-0-rc"):
            LOG.info(f"Skip BMHI check, nothing to check yet in '{kaas_version}'")
            return

        if self.cluster.provider != utils.Provider.baremetal:
            LOG.info(f"Skip BMHI check for non-BM provider '{self.cluster.provider}'")
            return

        assert kaas_manager.api.kaas_baremetalhostinventories.available, (
            f"MCC '{kaas_version}' doesn't have BareMetalHostInventory CRD, while it is expected for 2.29.0+")

        # Collect a message for each Machine without the inventory object
        missing = []
        for machine in self.cluster.get_machines():
            bmh_name = machine.get_bmh_name()
            if kaas_manager.api.kaas_baremetalhostinventories.present(name=bmh_name, namespace=machine.namespace):
                continue
            missing.append(f"Missing BareMetalHostInventory '{machine.namespace}/{bmh_name}' "
                           f"for Machine '{machine.namespace}/{machine.name}'")

        if not missing:
            LOG.info("BMH Inventory check passed, bmhi objects present for all Machines")
            return

        summary = (f"Missing BMH Inventory objects for some Machines in the Cluster "
                   f"'{self.cluster.namespace}/{self.cluster.name}':\n")
        error_msg = summary + "\n".join(missing)
        LOG.error(error_msg)
        if not settings.CHECK_BMH_INVENTORY:
            LOG.warning("'CHECK_BMH_INVENTORY' disabled, ignoring the missing BMHI objects")
            return
        raise Exception(error_msg)

    def check_day1_modes(self, machines, provisioning=None, deployment=None):
        """Assert that the machines have the expected day1 modes in their provider spec.

        The modes are read from machine.data['spec']['providerSpec']['value'],
        keys 'day1Provisioning' and 'day1Deployment'; a missing key is read as None.

        Args:
            machines: list of KaaSMachine objects
            provisioning: expected day1 provisioning mode; checked only when it is
                a string, possible values: 'manual', 'auto', ''
            deployment: expected day1 deployment mode; checked only when it is
                a string, possible values: 'manual', 'auto', ''
        Returns: None
        Raises:
            AssertionError: on the first machine with an unexpected mode
        """
        # A mode is verified only when a string (possibly empty) is passed for it
        verify_provisioning = isinstance(provisioning, str)
        verify_deployment = isinstance(deployment, str)

        for machine in machines:
            spec = machine.data.get('spec', {})
            provider_value = spec.get('providerSpec', {}).get('value', {})

            if verify_provisioning:
                found_provisioning = provider_value.get('day1Provisioning')
                assert found_provisioning == provisioning, (
                    f"Machine {machine.name} has wrong day1 provisioning "
                    f"mode: {found_provisioning}, expected: {provisioning}")

            if verify_deployment:
                found_deployment = provider_value.get('day1Deployment')
                assert found_deployment == deployment, (
                    f"Machine {machine.name} has wrong day1 deployment "
                    f"mode: {found_deployment}, expected: {deployment}")
