import pytest
from si_tests import logger
from si_tests import settings

from si_tests.deployments.utils import kubectl_utils
from si_tests.utils import exceptions, waiters

LOG = logger.logger


def get_reboot_warning(m_list, namespace_name):
    """Check whether every listed machine reports a reboot warning.

    Reads all ``machines`` objects of the namespace through kubectl and looks
    at ``status.providerStatus.warnings`` of each machine named in ``m_list``.
    A machine counts as warned when its warnings contain either
    "Scheduled for a reboot" or "Reboot is in progress".

    :param m_list: names of the machines that must report a reboot warning
    :param namespace_name: namespace to read the machines from
    :return: True only if every machine from ``m_list`` was found in the
        kubectl output and reports a reboot warning; machines that are absent
        from the output, or have no status/warnings yet, count as not warned
    """
    reboot_warnings = ("Scheduled for a reboot", "Reboot is in progress")
    kubectl = kubectl_utils.Kubectl()
    # Start every requested machine as "not warned": all() over an empty or
    # partial dict would otherwise report success for machines never seen.
    nodes_warning = {name: False for name in m_list}
    out = kubectl.get('machines', '-o yaml', namespace_name).result_yaml
    for i in out['items']:
        name = i['metadata']['name']
        # Skip machines we were not asked about and ones already confirmed.
        if name not in nodes_warning or nodes_warning[name]:
            continue
        # status / providerStatus / warnings may be absent or explicitly null
        provider_status = (i.get('status') or {}).get('providerStatus') or {}
        warning = provider_status.get('warnings') or []
        nodes_warning[name] = any(w in warning for w in reboot_warnings)
    LOG.info("machines without reboot warning:\n" + str([k for k, v in nodes_warning.items() if not v]))
    return all(nodes_warning.values())


def get_node_os_and_current_expected_kernel(cluster):
    """Add the expected kernel to each node's OS/kernel record.

    Takes the per-node records from
    ``cluster.get_nodes_kernel_and_tarfs_versions()`` and, for each of them,
    drops the ``dib_datetime`` key (if present) and adds ``expected_kernel``:
    the kernel listed for the node's ``os_version`` in the
    ``allowedDistributions`` of ``cluster.get_expected_kernel_version()``.
    Distributions flagged ``notgreenfield`` are ignored; an OS version with no
    matching distribution gets ``None``.

    The records are updated in place and the same mapping is returned.
    """
    expected_versions = cluster.get_expected_kernel_version()
    per_node_info = cluster.get_nodes_kernel_and_tarfs_versions()

    # os_version -> kernel; when an os_version is listed twice the last entry wins
    kernel_by_os = {}
    for distro in expected_versions["allowedDistributions"]:
        if distro.get("notgreenfield"):
            continue
        distro_os = distro["os_version"]
        kernel_by_os[distro_os] = distro["kernel"]

    for info in per_node_info.values():
        node_os = info['os_version']
        info.pop('dib_datetime', None)
        info['expected_kernel'] = kernel_by_os.get(node_os)

    return per_node_info


@pytest.mark.usefixtures("introspect_distribution_not_changed")
@pytest.mark.usefixtures("collect_downtime_statistics")     # Should be used if ALLOW_WORKLOAD == True
@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures('create_hoc_before_lcm_and_delete_after')
def test_rolling_reboot_machines_with_bad_machine(kaas_manager, _, show_step):
    """Graceful reboot of all cluster machines when one Machine is broken and gets stuck on reboot

    Scenario:
        1. Check init state cluster
        2. Make a cluster Machine broken (should not start docker and lcm-agent after reboot)
        3. Create Graceful Reboot Request
        4. Wait for nodes warning: 'Scheduled for a reboot' or 'Reboot is in progress'
        5. Wait until reboot is stuck on the broken Machine
        6. Disable the broken Machine
        7. Evacuate SL and Ceph services from disabled Machine
        8. Ensure that GracefulReboot request is completed, or apply W/A for PRODX-42401
        9. Check Cluster and nodes status
    """
    # NOTE(review): show_step(N) presumably looks up step N in the "Scenario" list of the
    # docstring above — keep the step numbers and the list in sync when editing; confirm.

    # Resolve the namespace and the cluster under test from the settings
    cluster_name = settings.TARGET_CLUSTER
    namespace_name = settings.TARGET_NAMESPACE
    ns = kaas_manager.get_namespace(namespace_name)
    cluster = ns.get_cluster(cluster_name)

    show_step(1)
    LOG.info(f"Check init state on the {cluster._cluster_type} cluster {cluster.namespace}/{cluster.name}")
    # Baseline checks before anything gets broken. The last call presumably verifies that
    # no graceful reboot request is active yet (expected_status=False) — TODO confirm.
    cluster.check.check_machines_status()
    cluster.check.check_cluster_readiness()
    cluster.check.check_k8s_nodes()
    cluster.check.wait_graceful_reboot_request(expected_status=False)

    #####################################
    # Select machines to make them broken
    #####################################
    show_step(2)
    target_machine = cluster.day2operations.get_machine_to_disable()
    # Remember the Machine name and its k8s node name up front: both are needed
    # for the Stacklight/Ceph evacuation and the pods/pvc cleanup in step 7.
    disabled_machine_name = target_machine.name
    disabled_k8s_node_name = target_machine.get_k8s_node_name()
    LOG.banner(f"Expected bad machine: '{disabled_machine_name}'")

    cluster.day2operations.make_broken_reboot_for_day2_operations(target_machine)

    show_step(3)
    LOG.info(f"Creating Graceful Reboot Request for all machines in cluster {cluster.namespace}/{cluster.name}")
    # The request covers every machine of the cluster, including the broken one
    m_list = cluster.get_machines_names()
    ns.create_gracefulrebootrequest_object(cluster.name, namespace_name, m_list)
    show_step(4)
    # get_reboot_warning() is true only once all machines from m_list show a reboot warning
    waiters.wait(
        lambda: get_reboot_warning(m_list, namespace_name),
        timeout=3600, interval=10,
        timeout_msg="Wait for 'reboot' warning for all machines")

    # Presumably waits until the request is reported as active (expected_status=True) — TODO confirm
    cluster.check.wait_graceful_reboot_request(expected_status=True)

    show_step(5)
    LOG.info('Waiting for all machines to reboot')
    # Boot times collected here are passed to wait_machines_reboot() below,
    # presumably as the baseline used to detect that a machine has rebooted.
    boot_time_dict = cluster.get_boot_time_dict(exclude_bastion=True)
    machines_number = len(boot_time_dict.keys())
    # Rebooting BM machines takes about 10-15 minutes, but sometimes may take 25+ minutes,
    # so the total timeout is 1800 per machine (30 minutes each, assuming seconds)
    machines_reboot_timeout = 1800 * machines_number

    ####################################################################################
    # Expect LCMStuckException for the broken Machine while GracefulReboot is executed #
    ####################################################################################
    try:
        LOG.info(f"*** Expect LCM stuck for one of the following broken Machines: "
                 f"{disabled_machine_name}")
        cluster.check.wait_machines_reboot(boot_time_dict, timeout=machines_reboot_timeout,
                                           expected_stuck_machine_names=[disabled_machine_name])
        # Reached only when wait_machines_reboot() returned without LCMStuckException, i.e. the
        # broken Machine did not block the reboot. This plain Exception is not caught by the
        # handler below, so it fails the test.
        raise Exception(f"GracefulReboot should be failed, but it is successfully finished with broken LCMMachines, "
                        f"please check the status of LCMMachine '{disabled_machine_name}'")
    except exceptions.LCMStuckException as e:
        LOG.info(f"Got the expected condition for the broken Machine: {e}")

    # 1. Check that target_machine is stuck or disabled
    assert target_machine.is_lcmmachine_stuck() or target_machine.is_disabled(), (
        f"Target Machine '{target_machine.name}' is still not stuck or disabled")

    # For the future: restore docker service autostart, so that it works when the Machine is enabled
    # again by another test. The call is made with start_services=False.
    cluster.day2operations.fix_broken_reboot_for_day2_operations(machines=[target_machine], start_services=False)

    show_step(6)
    # 2. Disable stuck machine and move Ceph/SL roles to another Machine
    cluster.day2operations.disable_machine(target_machine)

    # With the PRODX-42401 workaround active, the existing GracefulRebootRequest (if any)
    # is deleted here, before the SL/Ceph evacuation of step 7.
    if cluster.workaround.prodx_42401():
        gracefulrebootrequest = cluster.get_gracefulrebootrequest()
        if gracefulrebootrequest:
            LOG.banner("W/A(PRODX-42401): Delete GracefulRebootRequest before moving SL/Ceph labels")
            gracefulrebootrequest.delete()

    show_step(7)
    # 3. Try to unbind Stacklight pods from the disabled Machine
    cluster.day2operations.disable_stacklight_for_machine(disabled_machine_name, disabled_k8s_node_name)

    # 4. Try to remove the stuck machine from KaasCephCluster and move Ceph roles to other Machines
    cluster.day2operations.disable_ceph_for_machine(disabled_machine_name, disabled_k8s_node_name)

    # 5. Cleanup pods and pvc from stuck Machine
    cluster.day2operations.cleanup_pods_and_pvc_from_k8s_node(disabled_k8s_node_name)

    show_step(8)
    # Without the workaround the request was not deleted above, so wait for the machines
    # to reboot, this time with no machine expected to be stuck. With the workaround this
    # wait is skipped.
    if not cluster.workaround.prodx_42401():
        # Should be a main step after 2.27 is released
        LOG.banner(f"Machine '{target_machine.name}' was successfully disabled, continue with GracefulReboot")
        cluster.check.wait_machines_reboot(boot_time_dict, timeout=machines_reboot_timeout)

    LOG.info(f"Check cluster {cluster.namespace}/{cluster.name} reboot request is completed")
    cluster.check.wait_graceful_reboot_request(expected_status=False, timeout=600)
    ##########################################
    #  Cluster machines reboot is completed  #
    ##########################################
    LOG.banner("GracefulReboot is completed", sep="#")

    show_step(9)
    LOG.banner("Check cluster Machines readiness")
    cluster.check.check_machines_status()
    LOG.banner("Check Cluster conditions readiness")
    # Readiness is checked with the configured timeout extended by 1800
    cluster.day2operations.check_cluster_readiness(exp_provider_status=False,
                                                   timeout=settings.CHECK_CLUSTER_READINESS_TIMEOUT + 1800)
    LOG.banner("Check cluster nodes count")
    cluster.check.check_cluster_nodes()
    LOG.banner("Check cluster Pods")
    cluster.check.check_k8s_pods()
    LOG.banner("Check cluster Nodes readiness")
    cluster.check.check_k8s_nodes()
