#    Copyright 2024 Mirantis, Inc.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.
import pytest
import time
import yaml

from deepdiff import DeepDiff
from si_tests import settings
from si_tests import logger

from si_tests.managers.kaas_manager import Cluster
from si_tests.managers.kaas_manager import Manager
from si_tests.managers.netchecker_manager import NetcheckerManager

from si_tests.utils import utils, waiters, templates
from kubernetes.client.rest import ApiException
from urllib3.exceptions import MaxRetryError, ProtocolError


# Module-wide logger used by every helper and test in this file.
LOG = logger.logger


def is_cluster_management():
    """Tell whether the cluster targeted by this run is a management cluster.

    The cluster is looked up by ``settings.TARGET_CLUSTER`` inside the
    ``settings.TARGET_NAMESPACE`` namespace, through a ``Manager`` built from
    ``settings.KUBECONFIG_PATH``.

    Returns:
        The ``is_management`` attribute of the resolved cluster object.
    """
    manager = Manager(settings.KUBECONFIG_PATH)
    target_namespace = manager.get_namespace(settings.TARGET_NAMESPACE)
    return target_namespace.get_cluster(settings.TARGET_CLUSTER).is_management


# Computed once when this module is imported: the value is passed as the
# condition of the ``pytest.mark.skipif`` decorators further down, so it has
# to exist before the test functions are defined.
cluster_is_management = is_cluster_management()


# Inverse flag, used to skip management-only tests on child clusters.
cluster_is_child = not cluster_is_management


def check_pod_respawn(target_cluster, pods, pod_name_prefix):
    """Verify that a pod group is back in shape after a pod was deleted.

    Args:
        target_cluster: cluster object exposing the ``check`` helper.
        pods: pods listed before the deletion. The namespace of the first
            entry is used for the pod-count check, so it must not be empty.
        pod_name_prefix: name prefix identifying the pod group under test.
    """
    namespaces = [item.namespace for item in pods]
    namespace = namespaces[0]

    LOG.info("Check number of pods after delete procedure")
    target_cluster.check.check_pods_number(pod_name_prefix, namespace, pods)

    LOG.info(f"Check status for {pod_name_prefix} pods")
    target_cluster.check.check_k8s_pods(
        pods_prefix=pod_name_prefix, timeout=300, interval=30)


def create_netchecker(netchecker_manager):
    """Create an InfraConnectivityMonitor object and wait for its status.

    The object definition is rendered from the template at
    ``settings.NETCHECKER_FILE_PATH`` and parsed as YAML.

    Args:
        netchecker_manager: manager used to create the object and to wait
            for its status.

    Returns:
        The object returned by ``create_infraconnectivitymonitor``.
    """
    rendered = templates.render_template(settings.NETCHECKER_FILE_PATH)
    monitor = netchecker_manager.create_infraconnectivitymonitor(
        data=yaml.safe_load(rendered))
    netchecker_manager.wait_infraconnectivitymonitor_status(monitor)
    return monitor


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                               .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures("collect_downtime_statistics")     # Should be used if ALLOW_WORKLOAD == True
@pytest.mark.usefixtures("check_heat_stacks_after_test")
def test_ha_stop_containerd(target_cluster: Cluster, cluster_condition_check, _):
    """Stop containerd service on every cluster node (one at a time)
            Precondition - all expected pods and their replicas must be presented
            The following scenario is executed for every node

            Scenario:
                1. SSH to node
                2. Get number of containerd-shim processes
                3. Stop containerd service and make sure it is stopped
                4. Get number of containerd-shim processes and
                   compare with previous value (must be the same)
                5. Check that all pods are Running and Ready
                6. Start containerd service (always, even if a check failed)
                7. Check that the PIDs of the tracked network services
                   did not change

            Expected result - containerd-shim processes are running independently
            from containerd service and all containers in the pods are
            still operational even if containerd is down.
            """
    services = ["ovsdb-server", "ovs-vswitchd", "contrail-vrouter-agent"]
    ssh_key = settings.HA_TEST_PRIVATE_KEY_FILE
    shim_count_cmd = "pgrep -x containerd-shim | wc -l"

    for machine in target_cluster.get_machines():
        LOG.info(f"Accessing {machine.name}")
        shims_before = machine._run_cmd(shim_count_cmd, ssh_key=ssh_key).stdout_str
        LOG.info(f"There are containerd-shim {shims_before} processes")
        LOG.info("Check do we have mentioned services on node and get their PID if yes")
        pids_before_restart = utils.get_services_pids(machine, services, ssh_key)
        if pids_before_restart:
            LOG.info(f"Services pids before restart: {pids_before_restart}")

        LOG.info("Stopping containerd")
        machine._run_cmd("sudo service containerd stop", ssh_key=ssh_key)
        # pgrep exits with 1 when nothing matches, i.e. containerd is gone.
        waiters.wait(lambda: machine._run_cmd("pgrep -x containerd",
                                              check_exit_code=True,
                                              expected_codes=[1],
                                              ssh_key=ssh_key),
                     timeout=600,
                     interval=10,
                     timeout_msg="containerd service wasn't stopped")

        shims_after = machine._run_cmd(shim_count_cmd, ssh_key=ssh_key).stdout_str
        # Report the freshly measured value, not the one taken before the stop.
        LOG.info(f"Now it is {shims_after} containerd-shim processes")
        try:
            # The shim count must be unchanged while containerd is down.
            assert int(shims_before) == int(shims_after), (
                f"Number of containerd-shim processes changed on {machine.name} "
                f"after containerd stop: before={shims_before}, after={shims_after}")
            target_cluster.check.check_k8s_pods(timeout=1200, interval=30)
        except Exception as e:
            # Log for context, then let the failure fail the test; the
            # ``finally`` clause still brings containerd back first.
            LOG.error(e)
            raise
        finally:
            LOG.info("Start containerd")
            machine._run_cmd("sudo service containerd start", ssh_key=ssh_key)

        if pids_before_restart:
            pids_after_restart = utils.get_services_pids(machine, services, ssh_key)
            LOG.info(f"PIDs after restart: {pids_after_restart}")
            assert pids_before_restart == pids_after_restart, \
                (f"Processes {services} restarted during containerd restart\n"
                 f"pids before restart: {pids_before_restart}\n"
                 f"pids after restart: {pids_after_restart}")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures("collect_downtime_statistics")     # Should be used if ALLOW_WORKLOAD == True
@pytest.mark.usefixtures("check_heat_stacks_after_test")
def test_ha_kill_containerd_shim(target_cluster, cluster_condition_check, _):
    """Kill every containerd-shim process on each cluster node in turn.

    Precondition - all expected pods and their replicas must be present.

    Scenario (repeated for every machine of the cluster):
        1. SSH to the node and ``kill -9`` all containerd-shim processes
        2. Sleep for 60 seconds
        3. Wait until pods can be listed through the k8s API
        4. Check that all pods are Running and Ready

    Expected result - pods and their containers are Running and Ready.
    Number of replicas is the same.
    """
    kill_shims_cmd = ("for i in $(ps uax | grep containerd-shim | "
                      "grep -v grep | awk '{print $2}'); "
                      "do sudo kill -9 $i; done")

    def list_every_pod():
        return target_cluster.k8sclient.pods.list_all()

    for machine in target_cluster.get_machines():
        LOG.info(f"Accessing {machine.name}")
        machine._run_cmd(kill_shims_cmd,
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        time.sleep(60)

        LOG.info("Waiting for k8s to be ready")
        waiters.wait_pass(lambda: list_every_pod(), timeout=120)

        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30)
        # TODO: maybe check pod restarts also?
        # additional timeout?


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
def test_ha_delete_helm_controller_leader_pod(target_cluster, cluster_condition_check, helmbundles_check, _):
    """Delete the leader pod of each helm controller and wait for a new leader.

    Precondition - all expected pods and their replicas must be present.

    Scenario (repeated for 'helm-controller' and 'stacklight-helm-controller'):
        1. Detect the current leader
        2. Delete the pod whose name matches the leader
        3. Wait (up to 120 seconds) until the reported leader differs

    Expected result - leadership moves away from the deleted pod.
    """

    def current_leader(prefix):
        # Only the part of the leader identity before the first "_" is kept.
        return target_cluster.check.get_leader(app_prefix=prefix).split("_")[0]

    for app_group in ('helm-controller', 'stacklight-helm-controller'):
        group_pods = target_cluster.k8sclient.pods.list_all(name_prefix=app_group)
        assert len(group_pods) > 0, "Pods not found for selected app"
        old_leader = current_leader(app_group)

        LOG.info("Get leader pod and delete it")
        for candidate in group_pods:
            if candidate.name != old_leader:
                continue
            LOG.info(f"Deleting leader pod: {candidate.name}")
            candidate.delete(timeout=180)

        def leader_changed(leader_before=None):
            if current_leader(app_group) != leader_before:
                return True
            LOG.info(f"Leader is not changed yet. Should be not {leader_before}")
            return False

        waiters.wait(lambda: leader_changed(old_leader),
                     timeout=120, interval=5)


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}".format(settings.TARGET_CLUSTER)])
@pytest.mark.parametrize("svc_name,svc_ns,expected_downtime", settings.HA_SL_SVC_TIMEOUT)
def test_ha_sl_svc(target_cluster, svc_name, svc_ns, expected_downtime, _):
    """Delete one pod of a StackLight service and check the service downtime

     Scenario:
         1. Skip the kibana proxy case when logging is disabled
         2. Use the HA-mode downtime limit when StackLight HA is enabled
         3. Delete the first pod whose name starts with the service name
         4. Wait till the service pods are Running and Ready
         5. Read the probe-success data for the test window from Prometheus
         6. Check the calculated downtime against the expected one

     Expected result - pods are recreated and downtime is within the limit.
     """
    if svc_name == "iam-proxy-kibana" and not target_cluster.logging_enabled():
        skip_reason = "SL logging disabled. Kibana service is missing"
        LOG.warning(skip_reason)
        pytest.skip(skip_reason)

    # The measurement window opens before the pod is touched.
    window_start = time.time()

    ha_enabled = target_cluster.sl_ha_enabled()
    if ha_enabled:
        expected_downtime = settings.HA_SL_SVC_HA_MODE_DOWNTIME
    LOG.info("SL HA enabled" if ha_enabled else "SL HA disabled")

    victim = target_cluster.k8sclient.pods.list_starts_with(svc_name, namespace=svc_ns)[0]
    LOG.info(f"Delete {victim.name} pod")
    victim.delete()

    LOG.info(f"Wait until all {svc_name} pod(s) Created and Running")
    target_cluster.check.check_k8s_pods(
        pods_prefix=svc_name, target_namespaces=svc_ns, timeout=300, interval=30)
    window_end = time.time()

    probe_result = target_cluster.prometheusclient.get_svc_probe_success(
        namespace=svc_ns, service_name=svc_name, start=window_start, end=window_end)
    measured_downtime = target_cluster.check.calculate_k8s_downtime(probe_result)
    utils.check_downtime(measured_downtime, expected_downtime)


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                               .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures("collect_downtime_statistics")     # Should be used if ALLOW_WORKLOAD == True
@pytest.mark.usefixtures("check_heat_stacks_after_test")
def test_ha_haproxy(target_cluster: Cluster, cluster_condition_check, _):
    """Check that each node can balance traffic through HAProxy

    We have to check that all HAproxy can handle outage of any nodes/services.
    The test is skipped unless the cluster provider is baremetal.

    Scenario:
        1. Select node to serve public ip. Do for each control node
        2. Stop mcc-keepalived service on other nodes
        3. Check that API works well
        4. Pause the ucp-kube-apiserver container on other nodes
        5. Check that API is available
        6. Kill the ucp-kube-apiserver and ucp-controller containers on the
           selected node to drop any connection to HAProxy
        7. Check that the API becomes available again
        8. Check that a pod can be scheduled on the current node
        9. Restore k8s API and keepalived on other nodes
        10. Check that API works well

    Whatever the outcome, the ``finally`` section unpauses the API container
    and starts keepalived on every control node, then re-checks the cluster.
    """
    if target_cluster.provider is not utils.Provider.baremetal:
        msg = ("\nHA test to check HAProxy LB service were skipped "
               "as cluster doesn't support the feature")
        LOG.info(msg)
        pytest.skip(msg)
        # NOTE(review): pytest.skip() raises, so this return should never run.
        return

    ssh_user = 'mcc-user'
    cluster_release = target_cluster.clusterrelease_version
    LOG.info(f"Cluster release: {cluster_release}")

    ssh_key_file = settings.HA_TEST_PRIVATE_KEY_FILE
    nodes = target_cluster.get_machines(machine_type="control")
    try:
        LOG.info("Cluster has %s control nodes", [m.name for m in nodes])
        # 1. Select one control node
        #    The first node is appended again, so it is exercised a second
        #    time after all the others.
        #    NOTE(review): presumably to move the VIP back to it - confirm.
        for one in (nodes + [nodes[0]]):
            info = f"#  Check Haproxy on the node {one.name}  #"
            LOG.info(f"\n{'#' * len(info)}"
                     f"\n{info}"
                     f"\n{'#' * len(info)}")
            # Select other control nodes
            other_nodes = set(nodes) - set([one])
            # 2. Stop Keepalived on other nodes to move VIP to the selected control node
            LOG.info("Turning off Keepalived on %s", [m.name for m in other_nodes])
            for stop_one in other_nodes:
                LOG.info("Stop keepalived service on %s", stop_one.name)
                stop_one._run_cmd(
                    "sudo systemctl stop mcc-keepalived.service",
                    verbose=True,
                    ssh_key=ssh_key_file,
                    ssh_login=ssh_user,
                    reconnect=True)

            # 3. Check API
            target_cluster.check.check_k8s_nodes()
            target_cluster.check.check_actual_expected_pods()

            # 4. Turn off k8s API on other nodes
            LOG.info("Turning off backend k8s API services on %s", [m.name for m in other_nodes])
            for stop_one in other_nodes:
                LOG.info("Pause k8s api service on %s", stop_one.name)
                stop_one._run_cmd(
                    "sudo docker pause ucp-kube-apiserver",
                    verbose=True,
                    ssh_key=ssh_key_file,
                    ssh_login=ssh_user,
                    reconnect=True)

            LOG.info("Wait 10 seconds before HAProxy reacts to stop api services")
            time.sleep(10)

            # 5. Check API availability
            #    A successful machine listing with the same set of names is
            #    used as the proof that the API still answers.
            LOG.info("Getting list of nodes to check API availability")
            assert_nodes = target_cluster.get_machines(machine_type="control")

            assert set(n.name for n in nodes) == set(n.name for n in assert_nodes), \
                ("Can't fetch list of nodes. It seems that haproxy can't handle "
                 f"service off on {other_nodes}")
            LOG.info(f"Response with machines: {[n.name for n in assert_nodes]}")

            # 6. Simulate haproxy <-> ucp-controller connection failure
            #    MKE containers must be restarted automatically in a few seconds
            one._run_cmd(
                "sudo docker kill ucp-kube-apiserver; sudo docker kill ucp-controller",
                verbose=True,
                ssh_key=ssh_key_file,
                ssh_login=ssh_user,
                reconnect=True)

            # 7. Check that MKE containers were restarted and k8s API is available
            #    The listed API/connection exceptions are passed as ``expected``
            #    while the API is coming back.
            LOG.info(f"\n\nWait until k8s API becomes available again "
                     f"and VIP is returned on the machine {one.name}\n")
            waiters.wait_pass(
                target_cluster.get_machines, timeout=600, interval=10,
                expected=(ApiException, MaxRetryError, ProtocolError),
                timeout_msg=f"MKE services were not automatically restarted on the machine {one.name}")

            LOG.info("\n\n*** VIP is restored ***\n")
            LOG.info("Get the list of machines to check API availability after stopping MKE services")
            assert_nodes = target_cluster.get_machines(machine_type="control")
            assert set(n.name for n in nodes) == set(n.name for n in assert_nodes), \
                ("Can't fetch list of machines. It seems that haproxy can't handle "
                 f"service off on {other_nodes}")
            LOG.info(f"Response with machines: {[n.name for n in assert_nodes]}")

            # 8. Check that a pod can be scheduled on the current node
            #    Use exec_pod_cmd() to run a simple pod
            # NOTE(review): fixed 5-minute pause; the reason is not stated
            # here - presumably lets the node settle before scheduling.
            time.sleep(300)
            # NOTE(review): "docker ls" is not a standard docker subcommand;
            # "docker ps" may have been intended - confirm.
            one.exec_pod_cmd("sudo docker ls | grep ucp-kube-apiserver", verbose=True)

            # 9. Restore k8s API and keepalived on all nodes
            LOG.info("Recover k8s api services on %s", [m.name for m in other_nodes])
            for start_one in other_nodes:
                LOG.info("unpause k8s api service on %s", start_one.name)
                start_one._run_cmd(
                    "sudo docker unpause ucp-kube-apiserver",
                    verbose=True,
                    ssh_key=ssh_key_file,
                    ssh_login=ssh_user,
                    reconnect=True)

            target_cluster.check.check_k8s_nodes()
            target_cluster.check.check_actual_expected_pods()

            LOG.info("Turning on Keepalived on %s", [m.name for m in other_nodes])
            for start_one in other_nodes:
                LOG.info("Start Keepalived service on %s", start_one.name)
                start_one._run_cmd(
                    "sudo systemctl start mcc-keepalived.service",
                    verbose=True,
                    ssh_key=ssh_key_file,
                    ssh_login=ssh_user,
                    reconnect=True)

            LOG.info("Wait 30 seconds before HAProxy reacts to start api services. "
                     "Should give time to restore clustered pods.")
            time.sleep(30)

            # 10. Check API
            target_cluster.check.check_k8s_nodes()
            target_cluster.check.check_actual_expected_pods()

            LOG.info("Finish checking HAProxy service on %s", one.name)
    finally:
        # Cleanup runs on every control node regardless of where the scenario
        # stopped; "|| true" keeps the shell command from failing when the
        # service is already in the desired state.
        LOG.banner("Restore k8s and keepalived services on all machines")
        for one in nodes:
            LOG.info(f"Unpause k8s service on the node {one.name}")
            one._run_cmd(
                "sudo docker unpause ucp-kube-apiserver || true",
                verbose=True,
                ssh_key=ssh_key_file,
                ssh_login=ssh_user,
                reconnect=True)
            LOG.info(f"Start Keepalived service on the node {one.name}")
            one._run_cmd(
                "sudo systemctl start mcc-keepalived.service || true",
                verbose=True,
                ssh_key=ssh_key_file,
                ssh_login=ssh_user,
                reconnect=True)
        # Final health check after everything has been restored.
        target_cluster.check.check_k8s_nodes()
        target_cluster.check.check_actual_expected_pods()


@pytest.mark.skipif(cluster_is_management, reason="We should skip current HA test for mgmt cluster")
@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_delete_openvswitch_pods(target_cluster, cluster_condition_check, _):
    """Delete openvswitch pods one by one and wait for each to respawn

     Precondition - all expected pods and their replicas must be present.
     Skipped on management clusters.

     Scenario:
         1. List pods whose name starts with 'openvswitch'
         2. Delete one pod
         3. Check that the number of pods is restored
         4. Check pods statuses in this group (Running and Ready)
         5. Repeat 2-4 for every pod from the initial list

     Expected result - pods are recreated, number of replicas is restored.
     """
    prefix = 'openvswitch'

    def list_group_pods():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    initial_pods = list_group_pods()
    LOG.info(f"Pods before delete tests: {initial_pods}")
    for victim in initial_pods:
        LOG.info(f"Deleting pod: {victim.name} with pod_name_prefix: {prefix}")
        victim.delete(timeout=180)
        check_pod_respawn(target_cluster, initial_pods, prefix)

    LOG.info(f"Pods after: {list_group_pods()}")
    # Add custom checks below


@pytest.mark.skipif(cluster_is_management, reason="We should skip current HA test for mgmt cluster")
@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_delete_libvirt_pods(target_cluster, cluster_condition_check, _):
    """Delete libvirt pods one by one and wait for each to respawn

     Precondition - all expected pods and their replicas must be present.
     Skipped on management clusters.

     Scenario:
         1. List pods whose name starts with 'libvirt-libvirt-default'
         2. Delete one pod
         3. Check that the number of pods is restored
         4. Check pods statuses in this group (Running and Ready)
         5. Repeat 2-4 for every pod from the initial list

     Expected result - pods are recreated, number of replicas is restored.
     """
    prefix = 'libvirt-libvirt-default'

    def list_group_pods():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    initial_pods = list_group_pods()
    LOG.info(f"Pods before delete tests: {initial_pods}")
    for victim in initial_pods:
        LOG.info(f"Deleting pod: {victim.name} with pod_name_prefix: {prefix}")
        victim.delete(timeout=180)
        check_pod_respawn(target_cluster, initial_pods, prefix)

    LOG.info(f"Pods after: {list_group_pods()}")
    # Add custom checks below


@pytest.mark.skipif(cluster_is_management, reason="We should skip current HA test for mgmt cluster")
@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_delete_mariadb_pods(target_cluster, cluster_condition_check, _):
    """Delete mariadb pods one by one and wait for each to respawn

     Precondition - all expected pods and their replicas must be present.
     Skipped on management clusters.

     Scenario:
         1. List pods whose name starts with 'mariadb'
         2. Delete one pod
         3. Check that the number of pods is restored
         4. Check pods statuses in this group (Running and Ready)
         5. Repeat 2-4 for every pod from the initial list

     Expected result - pods are recreated, number of replicas is restored.
     """
    prefix = 'mariadb'

    def list_group_pods():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    initial_pods = list_group_pods()
    LOG.info(f"Pods before delete tests: {initial_pods}")
    for victim in initial_pods:
        LOG.info(f"Deleting pod: {victim.name} with pod_name_prefix: {prefix}")
        victim.delete(timeout=180)
        check_pod_respawn(target_cluster, initial_pods, prefix)

    LOG.info(f"Pods after: {list_group_pods()}")
    # Add custom checks below


@pytest.mark.skipif(cluster_is_management, reason="We should skip current HA test for mgmt cluster")
@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_delete_etcd_pods(target_cluster, cluster_condition_check, _):
    """Delete etcd pods one by one and wait for each to respawn

     Precondition - all expected pods and their replicas must be present.
     Skipped on management clusters.

     Scenario:
         1. List pods whose name starts with 'etcd-etcd'
         2. Delete one pod
         3. Check that the number of pods is restored
         4. Check pods statuses in this group (Running and Ready)
         5. Repeat 2-4 for every pod from the initial list

     Expected result - pods are recreated, number of replicas is restored.
     """
    prefix = 'etcd-etcd'

    def list_group_pods():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    initial_pods = list_group_pods()
    LOG.info(f"Pods before delete tests: {initial_pods}")
    for victim in initial_pods:
        LOG.info(f"Deleting pod: {victim.name} with pod_name_prefix: {prefix}")
        victim.delete(timeout=180)
        check_pod_respawn(target_cluster, initial_pods, prefix)

    LOG.info(f"Pods after: {list_group_pods()}")
    # Add custom checks below


@pytest.mark.skipif(cluster_is_child, reason="We should skip current HA test for child cluster")
@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_delete_iam_pods(target_cluster, cluster_condition_check, _):
    """Delete iam-keycloak pods one by one and wait for each to respawn

     Precondition - all expected pods and their replicas must be present.
     Skipped on child clusters.

     Scenario:
         1. List pods whose name starts with 'iam-keycloak'
         2. Delete one pod
         3. Check that the number of pods is restored
         4. Check pods statuses in this group (Running and Ready)
         5. Repeat 2-4 for every pod from the initial list

     Expected result - pods are recreated, number of replicas is restored.
     """
    prefix = 'iam-keycloak'

    def list_group_pods():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    initial_pods = list_group_pods()
    LOG.info(f"Pods before delete tests: {initial_pods}")
    for victim in initial_pods:
        LOG.info(f"Deleting pod: {victim.name} with pod_name_prefix: {prefix}")
        victim.delete(timeout=180)
        check_pod_respawn(target_cluster, initial_pods, prefix)

    LOG.info(f"Pods after: {list_group_pods()}")
    # Add custom checks below


@pytest.mark.skipif(cluster_is_child, reason="We should skip current HA test for child cluster")
@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_delete_mcc_cache_pods(target_cluster, cluster_condition_check, _):
    """Delete mcc-cache pods one by one and wait for each to respawn

     Precondition - all expected pods and their replicas must be present.
     Skipped on child clusters.

     Scenario:
         1. List pods whose name starts with 'mcc-cache'
         2. Delete one pod
         3. Check that the number of pods is restored
         4. Check pods statuses in this group (Running and Ready)
         5. Repeat 2-4 for every pod from the initial list

     Expected result - pods are recreated, number of replicas is restored.
     """
    prefix = 'mcc-cache'

    def list_group_pods():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    initial_pods = list_group_pods()
    LOG.info(f"Pods before delete tests: {initial_pods}")
    for victim in initial_pods:
        LOG.info(f"Deleting pod: {victim.name} with pod_name_prefix: {prefix}")
        victim.delete(timeout=180)
        check_pod_respawn(target_cluster, initial_pods, prefix)

    LOG.info(f"Pods after: {list_group_pods()}")
    # Add custom checks below


@pytest.mark.skipif(cluster_is_child, reason="We should skip current HA test for child cluster")
@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_delete_dhcp_relay_pods(target_cluster, cluster_condition_check, _):
    """Delete dhcp-relay pods one by one and wait for each to respawn

     Precondition - all expected pods and their replicas must be present.
     Skipped on child clusters.

     Scenario:
         1. List pods whose name starts with 'dhcp-relay'
         2. Delete one pod
         3. Check that the number of pods is restored
         4. Check pods statuses in this group (Running and Ready)
         5. Repeat 2-4 for every pod from the initial list

     Expected result - pods are recreated, number of replicas is restored.
     """
    prefix = 'dhcp-relay'

    def list_group_pods():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    initial_pods = list_group_pods()
    LOG.info(f"Pods before delete tests: {initial_pods}")
    for victim in initial_pods:
        LOG.info(f"Deleting pod: {victim.name} with pod_name_prefix: {prefix}")
        victim.delete(timeout=180)
        check_pod_respawn(target_cluster, initial_pods, prefix)

    LOG.info(f"Pods after: {list_group_pods()}")
    # Add custom checks below


@pytest.mark.skipif(cluster_is_child, reason="We should skip current HA test for child cluster")
@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_baremetal_operator_relay_pods(target_cluster, cluster_condition_check, _):
    """Delete baremetal-operator pods one by one and wait for each to respawn

     Precondition - all expected pods and their replicas must be present.
     Skipped on child clusters.

     Scenario:
         1. List pods whose name starts with 'baremetal-operator'
         2. Delete one pod
         3. Check that the number of pods is restored
         4. Check pods statuses in this group (Running and Ready)
         5. Repeat 2-4 for every pod from the initial list

     Expected result - pods are recreated, number of replicas is restored.
     """
    prefix = 'baremetal-operator'

    def list_group_pods():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    initial_pods = list_group_pods()
    LOG.info(f"Pods before delete tests: {initial_pods}")
    for victim in initial_pods:
        LOG.info(f"Deleting pod: {victim.name} with pod_name_prefix: {prefix}")
        victim.delete(timeout=180)
        check_pod_respawn(target_cluster, initial_pods, prefix)

    LOG.info(f"Pods after: {list_group_pods()}")
    # Add custom checks below


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
def test_ha_cnnc_agent_pods(target_cluster, cluster_condition_check, netchecker_cleanup_actions, _):
    """Delete cnnc-agent pods in any cluster one by one
     cnnc-agent is about metrics collection so the cnnc_total metric is checked
     after the pods are recreated.

     The test is skipped when netchecker is not supported by the cluster or is disabled.

     Scenario:
         1. List cnnc-agent pods
         2. Create netchecker
         3. Wait until the source_node labels of the cnnc_total metric match the k8s node
            names of the cluster machines
         4. Delete cnnc-agent pods one-by-one, after every deletion check the number
            of pods and their statuses (see check_pod_respawn)
         5. Wait again until all k8s nodes are present in the cnnc_total metric
         6. Take a baseline of the cnnc_total counters. NOTE: the baseline is taken after
            the pods are respawned, not before the deletion
         7. Wait until every counter from the baseline is increased

     Expected result - pods are recreated, number of replicas is restored. Metrics are collecting
     """

    netchecker = NetcheckerManager(target_cluster)
    if not netchecker.is_supported:
        pytest.skip(f"Netchecker is not supported on cluster {target_cluster.clusterrelease_version}")
    if not netchecker.is_enabled:
        pytest.skip("Netchecker is disabled. Check spec.providerSpec.value.disableNetChecker value")

    pod_name_prefix = 'cnnc-agent'
    metrics_query = 'cnnc_total'
    pods = target_cluster.k8sclient.pods.list_starts_with(pod_name_prefix)
    LOG.info(f"Pods before delete tests: {[pod.name for pod in pods]}")

    # Netchecker is created before any agent is deleted
    LOG.info("Creating netchecker before cnnc-agent pods are deleted")
    create_netchecker(netchecker_manager=netchecker)

    def check_all_source_nodes_exists_in_metrics(cluster, query):
        """Return True when source_node labels of the metric match the k8s node names of the machines"""
        k8s_nodes_name = [m.get_k8s_node_name() for m in cluster.get_machines()]
        all_metrics = cluster.prometheusclient.get_query(query=query)

        # dict.fromkeys drops duplicates and keeps the first-seen order for the report below
        source_node_total = list(dict.fromkeys(
            m.get('metric', {}).get('source_node', '') for m in all_metrics))
        if set(source_node_total) != set(k8s_nodes_name):
            # The report is built only when it is actually needed
            LOG.warning(f"Not all nodes are monitored by netchecker.\nExisted k8s nodes:\n"
                        f"{yaml.dump(k8s_nodes_name)}\n"
                        f"Source nodes from netchecker cnnc_total metric:\n{yaml.dump(source_node_total)}")
            return False
        return True

    def check_metrics_increased(data_before_test, data_after_test):
        """Return True when every counter from data_before_test is strictly greater in data_after_test

        Both arguments are expected in the format produced by group_metrics.
        """
        for node, targets1 in data_before_test.items():
            targets2 = data_after_test.get(node, [])
            counters1 = {entry['target_ip']: int(entry['counter']) for entry in targets1}
            counters2 = {entry['target_ip']: int(entry['counter']) for entry in targets2}

            for ip, counter1 in counters1.items():
                counter2 = counters2.get(ip)
                if counter2 is None:
                    LOG.warning(f"IP {ip} missing in data after test for node {node}")
                    return False
                if counter2 <= counter1:
                    LOG.warning(f"Counter for {ip} in node {node} did not increase. Before test: {counter1}. After "
                                f"test: {counter2}")
                    return False
        LOG.info("All counters are increased as expected")
        return True

    def group_metrics(metrics):
        """Group metric samples by source_node: {source_node: [{'target_ip': ..., 'counter': ...}, ...]}"""
        metrics_stats_to_compare = {}
        for metric in metrics:
            labels = metric.get('metric', {})
            source_node = labels.get('source_node', '')
            target_ip = labels.get('target_ip_address', '')
            # Element with index 1 of 'value' is used as the counter
            value = metric.get('value', [])[1]
            metrics_stats_to_compare.setdefault(source_node, []).append(
                {'target_ip': target_ip, 'counter': value})
        return metrics_stats_to_compare

    LOG.info("Waiting for all source nodes are exists in metrics")
    waiters.wait(lambda: check_all_source_nodes_exists_in_metrics(cluster=target_cluster, query=metrics_query),
                 timeout=300, interval=30)
    LOG.info("All metrics are in place.")

    for pod in pods:
        LOG.info(f"Deleting pod: {pod.name} with pod_name_prefix: {pod_name_prefix}")
        pod.delete(timeout=180)
        check_pod_respawn(target_cluster, pods, pod_name_prefix)

    pods = target_cluster.k8sclient.pods.list_starts_with(pod_name_prefix)
    LOG.info(f"Pods after: {[pod.name for pod in pods]}")
    LOG.info("Waiting for all source nodes are exists in metrics")
    waiters.wait(lambda: check_all_source_nodes_exists_in_metrics(cluster=target_cluster, query=metrics_query),
                 timeout=300, interval=30)

    # Baseline is taken after the pods are respawned; then wait for every counter to grow
    metrics_to_compare = group_metrics(target_cluster.prometheusclient.get_query(query=metrics_query))
    wait_for_metrics_increased_time = 300
    # The message is built before waiting, so it can contain only the baseline. Counters which
    # did not increase are reported by check_metrics_increased warnings on every attempt
    err_msg = (f"Some counters are not increased in {wait_for_metrics_increased_time} sec\n"
               f"Baseline data collected after pods respawn:\n{yaml.dump(metrics_to_compare)}\n"
               f"See warnings above for the counters which did not increase")
    waiters.wait(lambda: check_metrics_increased(data_before_test=metrics_to_compare,
                                                 data_after_test=group_metrics(
                                                     target_cluster.prometheusclient.get_query(
                                                         query=metrics_query))),
                 timeout=wait_for_metrics_increased_time, interval=30,
                 timeout_msg=err_msg)


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
def test_ha_cnnc_inventory_agent_pods(target_cluster, cluster_condition_check, netchecker_cleanup_actions, _):
    """Restart cnnc-inventory-agent pods one by one and verify the inventory config is unchanged

    cnnc-inventory-agent is about inventory configuration in the inventory config
    resource, so spec and status of this resource are compared before and after
    the agents are recreated. Per-node 'updatedAt' fields are excluded from the comparison.

    The test is skipped when netchecker is not supported by the cluster or is disabled.

    Scenario:
        1. Create netchecker object
        2. Collect spec and status of the inventory config
        3. Delete inventory agent pods one-by-one, after every deletion check
           the number of pods and their statuses (see check_pod_respawn)
        4. Collect spec and status of the inventory config again
        5. Compare collected data, any difference fails the test

    Expected result - pods are recreated, number of replicas is restored. Inventory is correct
    """

    netchecker = NetcheckerManager(target_cluster)
    if not netchecker.is_supported:
        pytest.skip(f"Netchecker is not supported on cluster {target_cluster.clusterrelease_version}")
    if not netchecker.is_enabled:
        pytest.skip("Netchecker is disabled. Check spec.providerSpec.value.disableNetChecker value")

    def _inventory_snapshot(netchecker_manager):
        """Return (spec, status) of the inventory config with 'updatedAt' removed from status nodes"""
        config_name = netchecker_manager.inventory_config_name
        config = netchecker_manager.get_inventory_config(name=config_name)
        assert config, (f"Inventory config with name {config_name} not found for cluster "
                        f"{netchecker_manager.cluster.name} in namespace "
                        f"{netchecker_manager.cluster.namespace}")
        raw = config.data
        spec = raw.get('spec', {})
        status = raw.get('status', {})
        # 'updatedAt' is dropped so it does not take part in the comparison
        for node in status.get('nodes', []):
            node.pop('updatedAt')
        return spec, status

    create_netchecker(netchecker_manager=netchecker)
    spec_before, status_before = _inventory_snapshot(netchecker)

    prefix = 'cnnc-inventory-agent'

    def _list_agents():
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    agents = _list_agents()
    LOG.info(f"Pods before delete tests: {[agent.name for agent in agents]}")
    for agent in agents:
        LOG.info(f"Deleting pod: {agent.name} with pod_name_prefix: {prefix}")
        agent.delete(timeout=180)
        check_pod_respawn(target_cluster, agents, prefix)

    spec_after, status_after = _inventory_snapshot(netchecker)
    spec_diff = DeepDiff(spec_before, spec_after, ignore_order=True)
    status_diff = DeepDiff(status_before, status_after, ignore_order=True)

    failures = []
    if spec_diff:
        failures.append(f"Specs in inventory config were changed after inventory-agent pods restarted\nSpec before "
                        f"restart:\n{yaml.dump(spec_before)}\nSpec after restart:\n"
                        f"{yaml.dump(spec_after)}\nNext difference found: {spec_diff}\n")
    if status_diff:
        failures.append(f"Status in inventory config were changed after inventory-agent pods restarted\nStatus before "
                        f"restart:\n{yaml.dump(status_before)}\nStatus after restart:\n"
                        f"{yaml.dump(status_after)}\nNext difference found: {status_diff}")
    # Every found difference is logged before the test is failed
    for failure in failures:
        LOG.error(failure)
    if failures:
        raise AssertionError("Object was changed after pods restarted")

    LOG.info("All specs and status for netchecker inventory are correct")

    agents = _list_agents()
    LOG.info(f"Pods after: {[agent.name for agent in agents]}")


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
def test_restart_inventory_agent_create_netchecker(target_cluster, cluster_condition_check,
                                                   netchecker_cleanup_actions, _):
    """Restart cnnc-inventory-agent pods one by one, then create a netchecker object

    The test is skipped when netchecker is not supported by the cluster or is disabled.

    Scenario:
        1. List all pods whose name starts with 'cnnc-inventory-agent'
        2. Delete inventory agent pods one-by-one, after every deletion check
           the number of pods and their statuses (see check_pod_respawn)
        3. List the pods once more and log their names
        4. Create netchecker object (see create_netchecker)

    Expected result - pods are recreated, number of replicas is restored. Netchecker is created
    successfully after pods restarted
    """

    netchecker = NetcheckerManager(target_cluster)
    if not netchecker.is_supported:
        pytest.skip(f"Netchecker is not supported on cluster {target_cluster.clusterrelease_version}")
    if not netchecker.is_enabled:
        pytest.skip("Netchecker is disabled. Check spec.providerSpec.value.disableNetChecker value")

    prefix = 'cnnc-inventory-agent'

    def _list_agents():
        # Always go through the cluster object so every listing is a fresh request
        return target_cluster.k8sclient.pods.list_starts_with(prefix)

    agents = _list_agents()
    LOG.info(f"Pods before delete tests: {[agent.name for agent in agents]}")

    for agent in agents:
        LOG.info(f"Deleting pod: {agent.name} with pod_name_prefix: {prefix}")
        agent.delete(timeout=180)
        # The initial list is used as the reference for the expected pods number
        check_pod_respawn(target_cluster, agents, prefix)

    agents = _list_agents()
    LOG.info(f"Pods after: {[agent.name for agent in agents]}")

    LOG.info("Creating netchecker object after inventory agents are restarted")
    create_netchecker(netchecker_manager=netchecker)
    LOG.info("Netchecker created sucessfully after recreating inventory agents ")
