import pytest

from si_tests import logger
from si_tests.managers.kaas_manager import Cluster
from si_tests.managers.kaas_manager import Manager
from si_tests import settings
from si_tests.clients.k8s.pods import K8sPod


LOG = logger.logger


def is_cluster_management():
    """Tell whether the cluster selected by the test settings is a management one.

    A ``Manager`` is built from ``settings.KUBECONFIG_PATH``; the namespace
    ``settings.TARGET_NAMESPACE`` is fetched from it, then the cluster
    ``settings.TARGET_CLUSTER`` is fetched from that namespace.

    Returns:
        The ``is_management`` attribute of the resolved cluster object.
    """
    kaas_manager = Manager(settings.KUBECONFIG_PATH)
    target_ns = kaas_manager.get_namespace(settings.TARGET_NAMESPACE)
    return target_ns.get_cluster(settings.TARGET_CLUSTER).is_management


# Resolved once, when this module is imported: the lookup in
# is_cluster_management() runs before any test in this file is executed.
cluster_is_management = is_cluster_management()


# Negation of the flag above; used by the skipif mark on
# test_ha_telemeter_server to skip that test for non-management clusters.
cluster_is_child = not cluster_is_management


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_kill_patroni(target_cluster: Cluster, cluster_condition_check, _):
    """Kill patroni processes on every machine labelled stacklight=enabled.

    The scenario below is run for each machine, first for the ``postgres``
    process and then for the ``runsvdir`` process.

    Scenario:
        1. List the containers named exactly "patroni" with crictl
        2. Resolve the PIDs of the process inside each listed container
        3. Send SIGKILL to these PIDs (the exit code of kill is not checked)
        4. Run the pod check for the "stacklight" namespace

    Expected result - the pod check passes after every kill.
    """
    list_patroni_cmd = "sudo crictl ps --name ^patroni$ | grep -v CONTAINER | cut -d ' ' -f 1"
    for node in target_cluster.get_machines(k8s_labels={"stacklight": "enabled"}):
        LOG.info(f"Accessing {node.name}")
        for proc_name in ('postgres', 'runsvdir'):
            # The container list is read again for each pass, i.e. after the
            # previous pass has already killed processes on this machine.
            listing = node._run_cmd(list_patroni_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            for container_id in listing.stdout_str.splitlines():
                pids = target_cluster.ha.pids_of_process_in_container_for_containerd(
                    node, container_id, process_name=proc_name)
                node._run_cmd(f"sudo kill -9 {pids}",
                              ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE,
                              check_exit_code=False)
                # Only the postgres pass reports which PIDs were killed.
                if proc_name == 'postgres':
                    LOG.info(f'Processes in container: {container_id} with pids: {pids} has been killed')
                LOG.info("Waiting for pods to be in a correct state")
                target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_fluentd_logs(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the ruby process of fluentd-logs on every machine labelled stacklight=enabled.

    The scenario below is run for each machine.

    Scenario:
        1. List the containers named exactly "fluentd-logs" with crictl
        2. Resolve the PIDs of the "ruby" process inside each listed container
        3. Send SIGKILL to these PIDs
        4. Run the pod check for the "stacklight" namespace

    Expected result - the pod check passes after every kill.
    """
    list_cmd = "sudo crictl ps --name ^fluentd-logs$ | grep -v CONTAINER | cut -d ' ' -f 1"
    for node in target_cluster.get_machines(k8s_labels={"stacklight": "enabled"}):
        LOG.info(f"Accessing {node.name}")
        listing = node._run_cmd(list_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        for container_id in listing.stdout_str.splitlines():
            pids = target_cluster.ha.pids_of_process_in_container_for_containerd(
                node, container_id, process_name='ruby')
            node._run_cmd(f"sudo kill -9 {pids}", ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            LOG.info("Waiting for pods to be in a correct state")
            target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_cadvisor(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the cadvisor process on every machine labelled stacklight=enabled.

    The scenario below is run for each machine.

    Scenario:
        1. List the containers named exactly "cadvisor" with crictl
        2. Resolve the PIDs of the "cadvisor" process inside each listed container
        3. Send SIGKILL to these PIDs
        4. Run the pod check for the "stacklight" namespace

    Expected result - the pod check passes after every kill.
    """
    list_cmd = "sudo crictl ps --name ^cadvisor$ | grep -v CONTAINER | cut -d ' ' -f 1"
    for node in target_cluster.get_machines(k8s_labels={"stacklight": "enabled"}):
        LOG.info(f"Accessing {node.name}")
        listing = node._run_cmd(list_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        for container_id in listing.stdout_str.splitlines():
            pids = target_cluster.ha.pids_of_process_in_container_for_containerd(
                node, container_id, process_name='cadvisor')
            node._run_cmd(f"sudo kill -9 {pids}", ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            LOG.info("Waiting for pods to be in a correct state")
            target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


def _kill_process_in_alerta_containers(target_cluster: Cluster, machine, process_name: str) -> None:
    """Kill ``process_name`` in every container named exactly "alerta" on ``machine``.

    Args:
        target_cluster: cluster object providing the ``ha`` and ``check`` helpers.
        machine: machine object on which the commands are run.
        process_name: name of the process whose PIDs are resolved and killed.

    After each kill the pod check for the "stacklight" namespace is run
    before the next container is handled.
    """
    containers = machine._run_cmd(
        "sudo crictl ps --name ^alerta$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    for cont_id in containers.splitlines():
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name=process_name)
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_alerta(target_cluster: Cluster, cluster_condition_check, _):
    """Kill alerta processes on every machine that hosts an alerta pod.

    Precondition - at least one pod with the "alerta" name prefix exists in
    the "stacklight" namespace.

    Scenario:
        1. For the machine of every alerta pod: kill "nginx" and then "uwsgi"
           in each container named exactly "alerta", running the pod check
           for the "stacklight" namespace after every kill
        2. List the alerta pods again
        3. For the machine of every listed pod: kill "uwsgi" once more in
           each "alerta" container, running the pod check after every kill

    Expected result - the pod check passes after every kill.
    """
    pods: list[K8sPod] = target_cluster.k8sclient.pods.list(namespace="stacklight",
                                                            name_prefix="alerta")
    assert pods, "No alerta pods found"

    for pod in pods:
        machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
        LOG.info(f"Accessing {machine.name}")
        # NOTE(review): uwsgi is killed here and again in the second pass
        # below - confirm that the double kill is intended.
        for process_name in ('nginx', 'uwsgi'):
            _kill_process_in_alerta_containers(target_cluster, machine, process_name)

    # Update pods list after killing alerta nginx service to perform uwsgi restart check
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight",
                                              name_prefix="alerta")
    # Without this check an empty refreshed list would silently skip the second pass.
    assert pods, "No alerta pods found after the first kill pass"

    for pod in pods:
        machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
        LOG.info(f"Accessing {machine.name}")
        _kill_process_in_alerta_containers(target_cluster, machine, 'uwsgi')


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_opensearch_master(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the java process of opensearch on every machine labelled stacklight=enabled.

    The scenario below is run for each machine.

    Scenario:
        1. List the containers named exactly "opensearch" with crictl
        2. Resolve the PIDs of the "java" process inside each listed container
        3. Send SIGKILL to these PIDs
        4. Run the pod check for the "stacklight" namespace

    Expected result - the pod check passes after every kill.
    """
    list_cmd = "sudo crictl ps --name ^opensearch$ | grep -v CONTAINER | cut -d ' ' -f 1"
    for node in target_cluster.get_machines(k8s_labels={"stacklight": "enabled"}):
        LOG.info(f"Accessing {node.name}")
        listing = node._run_cmd(list_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        for container_id in listing.stdout_str.splitlines():
            pids = target_cluster.ha.pids_of_process_in_container_for_containerd(
                node, container_id, process_name='java')
            node._run_cmd(f"sudo kill -9 {pids}", ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            LOG.info("Waiting for pods to be in a correct state")
            target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_machine_exporter(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the node_exporter process on every machine labelled stacklight=enabled.

    The scenario below is run for each machine.

    Scenario:
        1. List the containers named exactly "prometheus-node-exporter" with crictl
        2. Resolve the PIDs of the "node_exporter" process inside each listed container
        3. Send SIGKILL to these PIDs
        4. Run the pod check for the "stacklight" namespace

    Expected result - the pod check passes after every kill.
    """
    list_cmd = "sudo crictl ps --name ^prometheus-node-exporter$ | grep -v CONTAINER | cut -d ' ' -f 1"
    for node in target_cluster.get_machines(k8s_labels={"stacklight": "enabled"}):
        LOG.info(f"Accessing {node.name}")
        listing = node._run_cmd(list_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        for container_id in listing.stdout_str.splitlines():
            pids = target_cluster.ha.pids_of_process_in_container_for_containerd(
                node, container_id, process_name='node_exporter')
            node._run_cmd(f"sudo kill -9 {pids}", ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            LOG.info("Waiting for pods to be in a correct state")
            target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


# telegraf-ds-smart - checker done
@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_telegraf_ds_smart(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the telegraf process of telegraf-ds-smart on every machine labelled stacklight=enabled.

    The scenario below is run for each machine.

    Scenario:
        1. List the containers named exactly "telegraf-ds-smart" with crictl
        2. Resolve the PIDs of the "telegraf" process inside each listed container
        3. Send SIGKILL to these PIDs
        4. Run the pod check for the "stacklight" namespace

    Expected result - the pod check passes after every kill.
    """
    list_cmd = "sudo crictl ps --name ^telegraf-ds-smart$ | grep -v CONTAINER | cut -d ' ' -f 1"
    for node in target_cluster.get_machines(k8s_labels={"stacklight": "enabled"}):
        LOG.info(f"Accessing {node.name}")
        listing = node._run_cmd(list_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        for container_id in listing.stdout_str.splitlines():
            pids = target_cluster.ha.pids_of_process_in_container_for_containerd(
                node, container_id, process_name='telegraf')
            node._run_cmd(f"sudo kill -9 {pids}", ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            LOG.info("Waiting for pods to be in a correct state")
            target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.skipif(cluster_is_child, reason="We should skip current HA test for child cluster")
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_telemeter_server(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the telemeter-server process on the machine hosting the first listed pod.

    Precondition - at least one pod with the "telemeter-server" name prefix
    exists in the "stacklight" namespace.

    telemeter-server pod is available only on mgmt cluster, so the test is
    skipped when the target cluster is a child one.

    Scenario:
        1. Take the first telemeter-server pod and resolve its machine
        2. List the containers named exactly "telemeter-server" on that machine
        3. Kill the "telemeter-server" process in each listed container
        4. Run the pod check for the "stacklight" namespace after every kill

    Expected result - the pod check passes after every kill.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="telemeter-server")
    # Truthiness check: an empty list must fail here with the message below
    # instead of raising IndexError on pods[0].
    assert pods, "No telemeter-server pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^telemeter-server$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    for cont_id in containers.splitlines():
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='telemeter-server')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_telegraf_docker_swarm(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the telegraf process on the machine hosting the first telegraf-docker-swarm pod.

    Precondition - at least one pod with the "telegraf-docker-swarm" name
    prefix exists in the "stacklight" namespace.

    Scenario:
        1. Take the first telegraf-docker-swarm pod and resolve its machine
        2. List the containers named exactly "telegraf-docker-swarm" on that machine
        3. Kill the "telegraf" process in each listed container
        4. Run the pod check for the "stacklight" namespace after every kill

    Expected result - the pod check passes after every kill.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="telegraf-docker-swarm")
    # Truthiness check: an empty list must fail here with the message below
    # instead of raising IndexError on pods[0].
    assert pods, "No telegraf-docker-swarm pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^telegraf-docker-swarm$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    for cont_id in containers.splitlines():
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='telegraf')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_prometheus_relay(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the prometheus relay process on the machine hosting the first prometheus-relay pod.

    Precondition - at least one pod with the "prometheus-relay" name prefix
    exists in the "stacklight" namespace.

    Scenario:
        1. Take the first prometheus-relay pod and resolve its machine
        2. List the containers named exactly "prometheus-relay" on that machine
        3. Kill the "prometheus-rela" process in each listed container
        4. Run the pod check for the "stacklight" namespace after every kill

    Expected result - the pod check passes after every kill.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="prometheus-relay")
    # Truthiness check: an empty list must fail here with the message below
    # instead of raising IndexError on pods[0].
    assert pods, "No prometheus-relay pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^prometheus-relay$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    for cont_id in containers.splitlines():
        # NOTE(review): the process name looks truncated on purpose (presumably
        # to the 15-character kernel comm name) - confirm against the helper.
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='prometheus-rela')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_prometheus_kube_state_metrics(target_cluster: Cluster, cluster_condition_check, _):
    """Kill kube-state-metrics on the machine hosting the first prometheus-kube-state-metrics pod.

    Precondition - at least one pod with the "prometheus-kube-state-metrics"
    name prefix exists in the "stacklight" namespace.

    Scenario:
        1. Take the first prometheus-kube-state-metrics pod and resolve its machine
        2. List the containers named exactly "prometheus-kube-state-metrics" on that machine
        3. Kill the "kube-state-metr" process in each listed container
        4. Run the pod check for the "stacklight" namespace after every kill

    Expected result - the pod check passes after every kill.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="prometheus-kube-state-metrics")
    # Truthiness check: an empty list must fail here with the message below
    # instead of raising IndexError on pods[0].
    assert pods, "No prometheus-kube-state-metrics pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^prometheus-kube-state-metrics$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    for cont_id in containers.splitlines():
        # NOTE(review): the process name looks truncated on purpose (presumably
        # to the 15-character kernel comm name) - confirm against the helper.
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='kube-state-metr')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_prometheus_es_exporter(target_cluster: Cluster, cluster_condition_check, _):
    """Kill the python process on the machine hosting the first prometheus-es-exporter pod.

    Precondition - at least one pod with the "prometheus-es-exporter" name
    prefix exists in the "stacklight" namespace.

    Scenario:
        1. Take the first prometheus-es-exporter pod and resolve its machine
        2. List the containers named exactly "prometheus-es-exporter" on that machine
        3. Kill the "python" process in each listed container
        4. Run the pod check for the "stacklight" namespace after every kill

    Expected result - the pod check passes after every kill.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="prometheus-es-exporter")
    # Truthiness check: an empty list must fail here with the message below
    # instead of raising IndexError on pods[0].
    assert pods, "No prometheus-es-exporter pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^prometheus-es-exporter$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    for cont_id in containers.splitlines():
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='python')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_prometheus_blackbox_exporter(target_cluster: Cluster, cluster_condition_check, _):
    """Kill an exporter process on the machine hosting the first prometheus-blackbox-exporter pod.

    Precondition - at least one pod with the "prometheus-blackbox-exporter"
    name prefix exists in the "stacklight" namespace.

    Scenario:
        1. Take the first prometheus-blackbox-exporter pod and resolve its machine
        2. List the containers named exactly "prometheus-node-exporter" on that machine
        3. Kill the "node_exporter" process in each listed container
        4. Run the pod check for the "stacklight" namespace after every kill

    Expected result - the pod check passes after every kill.

    NOTE(review): steps 2-3 target the node exporter, not the blackbox
    exporter this test is named after. This looks like a copy-paste from
    test_ha_machine_exporter; the container and process names are left
    unchanged until the real blackbox exporter names are confirmed.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="prometheus-blackbox-exporter")
    # Truthiness check: an empty list must fail here with the message below
    # instead of raising IndexError on pods[0].
    assert pods, "No prometheus-blackbox-exporter pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    # NOTE(review): filter and process name refer to prometheus-node-exporter - see docstring.
    containers = machine._run_cmd(
        "sudo crictl ps --name ^prometheus-node-exporter$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    for cont_id in containers.splitlines():
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='node_exporter')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_elasticsearch_exporter(target_cluster: Cluster, cluster_condition_check, _):
    """Kill elasticsearch exporter service on the machine hosting its pod

    Precondition - all expected pods and their replicas must be presented
    The scenario is executed only for the machine where the first found
    elasticsearch-exporter pod is running

    Scenario:
        1. Find elasticsearch-exporter pods in the stacklight namespace
        2. SSH to the machine hosting the first found pod
        3. Kill elasticsearch exporter process in every matching container
        4. Check that all pods are Running and Ready

    Expected result - elasticsearch exporter service restored successfully.

    The "_" argument only carries the parametrized "CLUSTER_NAME=..." label
    and is not used in the test body.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="elasticsearch-exporter")
    # Check truthiness instead of "is not None": an empty result would pass
    # the None check and then fail on pods[0] with a bare IndexError.
    assert pods, "No elasticsearch-exporter pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^elasticsearch-exporter$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    # One container ID per line of the command output
    for cont_id in containers.splitlines():
        # NOTE(review): 'elasticsearch_e' is presumably the truncated process
        # name as reported by the kernel - confirm against the helper.
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='elasticsearch_e')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_opensearch_dashboards(target_cluster: Cluster, cluster_condition_check, _):
    """Kill opensearch dashboards service on the machine hosting its pod

    Precondition - all expected pods and their replicas must be presented
    The scenario is executed only for the machine where the first found
    opensearch-dashboards pod is running

    Scenario:
        1. Find opensearch-dashboards pods in the stacklight namespace
        2. SSH to the machine hosting the first found pod
        3. Kill opensearch dashboards process in every matching container
        4. Check that all pods are Running and Ready

    Expected result - opensearch dashboards service restored successfully.

    The "_" argument only carries the parametrized "CLUSTER_NAME=..." label
    and is not used in the test body.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="opensearch-dashboards")
    # Check truthiness instead of "is not None": an empty result would pass
    # the None check and then fail on pods[0] with a bare IndexError.
    assert pods, "No opensearch-dashboards pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^opensearch-dashboards$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    # One container ID per line of the command output
    for cont_id in containers.splitlines():
        # The process looked up inside the container is named 'node'
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='node')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_metricbeat(target_cluster: Cluster, cluster_condition_check, _):
    """Kill metricbeat service on the machine hosting its pod

    Precondition - all expected pods and their replicas must be presented
    The scenario is executed only for the machine where the first found
    metricbeat pod is running

    Scenario:
        1. Find metricbeat pods in the stacklight namespace
        2. SSH to the machine hosting the first found pod
        3. Kill 'tini' process in every metricbeat container
        4. Check that all pods are Running and Ready
        5. Kill 'metricbeat' process in every metricbeat container
        6. Check that all pods are Running and Ready

    Expected result - metricbeat service restored successfully.

    The "_" argument only carries the parametrized "CLUSTER_NAME=..." label
    and is not used in the test body.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="metricbeat")
    # Check truthiness instead of "is not None": an empty result would pass
    # the None check and then fail on pods[0] with a bare IndexError.
    assert pods, "No metricbeat pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    # Two kill rounds in a fixed order: 'tini' first, then 'metricbeat'.
    # The container list is re-read before each round (presumably because
    # the container IDs change once the containers are restarted).
    for process_name in ('tini', 'metricbeat'):
        containers = machine._run_cmd(
            "sudo crictl ps --name ^metricbeat$ | grep -v CONTAINER | cut -d ' ' -f 1",
            ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
        # One container ID per line of the command output
        for cont_id in containers.splitlines():
            pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                         cont_id,
                                                                                         process_name=process_name)
            machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                             ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            LOG.info("Waiting for pods to be in a correct state")
            target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.skipif(cluster_is_child, reason="We should skip current HA test for child cluster")
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_metric_collector(target_cluster: Cluster, cluster_condition_check, _):
    """Kill metric collector service on the machine hosting its pod

    Precondition - all expected pods and their replicas must be presented
    The scenario is executed only for the machine where the first found
    metric-collector pod is running
    Scenario:
        1. Find metric-collector pods in the stacklight namespace
        2. SSH to the machine hosting the first found pod
        3. Kill metric collector process in every matching container
        4. Check that all pods are Running and Ready
    Expected result - metric collector service restored successfully.

    This pod is available only on mgmt, so the test is skipped for child
    clusters.

    The "_" argument only carries the parametrized "CLUSTER_NAME=..." label
    and is not used in the test body.
    """
    # Keep the whole list here: indexing the result right away and then doing
    # pods[0] again would subscript a single pod object instead of the list.
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="metric-collector")
    # Check truthiness instead of "is not None": an empty result would pass
    # the None check and then fail on pods[0] with a bare IndexError.
    assert pods, "No metric-collector pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^metric-collector$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    # One container ID per line of the command output
    for cont_id in containers.splitlines():
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(
            machine,
            cont_id,
            process_name='mcc-metric-collector')

        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}"
                         .format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_grafana(target_cluster: Cluster, cluster_condition_check, _):
    """Kill grafana service on the machine hosting its pod

    Precondition - all expected pods and their replicas must be presented
    The scenario is executed only for the machine where the first found
    grafana pod is running
    Scenario:
        1. Find grafana pods in the stacklight namespace
        2. SSH to the machine hosting the first found pod
        3. Kill grafana process in every matching container
        4. Check that all pods are Running and Ready
    Expected result - grafana service restored successfully.

    The "_" argument only carries the parametrized "CLUSTER_NAME=..." label
    and is not used in the test body.
    """
    pods = target_cluster.k8sclient.pods.list(namespace="stacklight", name_prefix="grafana")
    # Check truthiness instead of "is not None": an empty result would pass
    # the None check and then fail on pods[0] with a bare IndexError.
    assert pods, "No grafana pods found"
    pod = pods[0]

    machine = target_cluster.get_machine_by_k8s_name(pod.node_name)
    LOG.info(f"Accessing {machine.name}")
    containers = machine._run_cmd(
        "sudo crictl ps --name ^grafana$ | grep -v CONTAINER | cut -d ' ' -f 1",
        ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE).stdout_str
    # One container ID per line of the command output
    for cont_id in containers.splitlines():
        pids_to_kill = target_cluster.ha.pids_of_process_in_container_for_containerd(machine,
                                                                                     cont_id,
                                                                                     process_name='grafana')
        machine._run_cmd(f"sudo kill -9 {pids_to_kill}",
                         ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        LOG.info("Waiting for pods to be in a correct state")
        target_cluster.check.check_k8s_pods(timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}".format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_prometheus_alertmanager(target_cluster: Cluster, cluster_condition_check, _):
    """Kill prometheus alertmanager on the machines hosting its two replicas

    Precondition - all expected pods and their replicas must be presented
    The scenario is executed for prometheus-alertmanager-0 and
    prometheus-alertmanager-1 pods (one at a time)
    Scenario:
        1. Get the pod by name and find the machine it is running on
        2. SSH to the machine and list prometheus-alertmanager containers
        3. Kill alertmanager process in every listed container
        4. Check that all pods are Running and Ready
    Expected result - all prometheus alertmanager services restored successfully.
    """
    list_containers_cmd = (
        "sudo crictl ps --name ^prometheus-alertmanager$ | grep -v CONTAINER | cut -d ' ' -f 1")
    for replica_name in ('prometheus-alertmanager-0', "prometheus-alertmanager-1"):
        replica: K8sPod = target_cluster.k8sclient.pods.get(replica_name, namespace="stacklight")
        host = target_cluster.get_machine_by_k8s_name(replica.node_name)
        LOG.info(f"Accessing {host.name}")
        listing = host._run_cmd(list_containers_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        # Each line of the listing output is a container ID
        for container_id in listing.stdout_str.splitlines():
            pids = target_cluster.ha.pids_of_process_in_container_for_containerd(
                host, container_id, process_name='alertmanager')
            host._run_cmd(f"sudo kill -9 {pids}", ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            LOG.info("Waiting for pods to be in a correct state")
            target_cluster.check.check_k8s_pods(
                timeout=1200, interval=30, target_namespaces="stacklight")


@pytest.mark.parametrize("_", ["CLUSTER_NAME={0}".format(settings.TARGET_CLUSTER)])
@pytest.mark.usefixtures('log_method_time')
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keystone')
@pytest.mark.usefixtures('mcc_loadtest_kibana')
@pytest.mark.usefixtures('mcc_loadtest_alertmanager')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
def test_ha_prometheus_server(target_cluster: Cluster, cluster_condition_check, _):
    """Kill prometheus server on the machines hosting its two replicas

    Precondition - all expected pods and their replicas must be presented
    The scenario is executed for prometheus-server-0 and
    prometheus-server-1 pods (one at a time)
    Scenario:
        1. Get the pod by name and find the machine it is running on
        2. SSH to the machine and list prometheus-server containers
        3. Kill prometheus process in every listed container
        4. Check that all pods are Running and Ready
    Expected result - all prometheus server services restored successfully.
    """
    list_containers_cmd = (
        "sudo crictl ps --name ^prometheus-server$ | grep -v CONTAINER | cut -d ' ' -f 1")
    for replica_name in ('prometheus-server-0', "prometheus-server-1"):
        replica: K8sPod = target_cluster.k8sclient.pods.get(replica_name, namespace="stacklight")
        host = target_cluster.get_machine_by_k8s_name(replica.node_name)
        LOG.info(f"Accessing {host.name}")
        listing = host._run_cmd(list_containers_cmd, ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
        # Each line of the listing output is a container ID
        for container_id in listing.stdout_str.splitlines():
            pids = target_cluster.ha.pids_of_process_in_container_for_containerd(
                host, container_id, process_name='prometheus')
            host._run_cmd(f"sudo kill -9 {pids}", ssh_key=settings.HA_TEST_PRIVATE_KEY_FILE)
            LOG.info("Waiting for pods to be in a correct state")
            target_cluster.check.check_k8s_pods(
                timeout=1200, interval=30, target_namespaces="stacklight")
