import pytest

from si_tests import settings
from si_tests import logger
from si_tests.utils import update_child_clusterrelease_actions
from si_tests.utils import update_release_names, exceptions
from si_tests.utils.utils import Provider

# Module-wide logger used by the fixture and the tests in this file.
LOG = logger.logger


# NOTE: this rebinds the name of the imported ``update_release_names`` module
# to the list of release names it generates. After this line the module itself
# is no longer reachable under that name in this file; only the list is.
# The list is materialized once at import time because it is used twice in the
# ``update_release_name`` fixture decorator (for ``params`` and for ``ids``).
update_release_names = list(update_release_names.generate_update_release_names())
# Module-level flag shared between the parametrized update steps. It is flipped
# to True by the ``update_release_name`` fixture when an update step is
# considered failed, so that the remaining steps are skipped.
is_update_test_failed = False


@pytest.fixture(scope='function', params=update_release_names,
                ids=[f"RELEASE={x}" for x in update_release_names])
def update_release_name(request):
    """Yield the clusterrelease name for the current update step

    The fixture is parametrized with every name from the module-level
    ``update_release_names`` list, so a test that uses it runs once per
    release name. The steps form a chain: once one step is recorded as
    failed, every following step is skipped.

    A step is recorded as failed when its call-phase report is missing
    (the test body never ran) or when that report says the test failed.
    A step that skipped itself, for example because the cluster is already
    on the requested release, is NOT a failure and does not block the
    following releases.

    :param request: pytest fixture request, ``request.param`` is the
                    clusterrelease name for this step
    :return: yields the clusterrelease name
    """
    global is_update_test_failed
    # Check if the previous update steps failed
    if is_update_test_failed:
        msg = (f"Skip updating clusterrelease to {request.param} because "
               f"previous update step failed")
        LOG.info(msg)
        pytest.skip(msg)

    yield request.param

    # Check the result of the current step.
    # ``rep_call`` is not a built-in attribute of a pytest item; presumably it
    # is attached by a ``pytest_runtest_makereport`` hook in conftest.py
    # - TODO confirm against the project conftest.
    call_report = getattr(request.node, 'rep_call', None)
    # Only a missing report or an actual failure breaks the chain. Checking
    # ``not passed`` here would also treat a skipped step as a failure.
    if call_report is None or call_report.failed:
        is_update_test_failed = True


"""
Child cluster requirements:
1. In child_data, ceph replicas size must be no more than "<storage_nodes_count> - <disabled_storage_nodes_count>",
   otherwise KaasCephCluster may show the following message during cluster upgrade:
    - object storage data pool with deviceClass 'hdd' and failureDomain 'host' has
      targeted to have '3' count of replicas/chunks, while in spec specified only
      '2' failureDomains 'host' with deviceClass 'hdd'
"""


def _find_k8s_version_mismatches(child_cluster):
    """Return {node name: message} for nodes where kubelet and kube-proxy versions differ"""
    mismatches = {}
    for k8s_node in child_cluster.k8sclient.nodes.list_all():
        info = k8s_node.read().status.node_info
        if info.kubelet_version == info.kube_proxy_version:
            continue
        mismatches[k8s_node.name] = (
            f"kubelet_version {info.kubelet_version} doesn't match "
            f"kube_proxy_version {info.kube_proxy_version} version")
    return mismatches


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures("store_updated_child_cluster_description")
@pytest.mark.usefixtures("introspect_child_target_objects")
@pytest.mark.usefixtures('create_hoc_before_lcm_and_delete_after')
@pytest.mark.usefixtures("collect_downtime_statistics")     # Should be used if ALLOW_WORKLOAD == True
def test_update_child_clusterrelease_with_bad_machine_to_fix(kaas_manager, update_release_name, _):
    """Upgrade a child cluster while one of its Machines is broken, then disable that Machine

    One Machine gets its LCM tasks corrupted before the upgrade starts. The upgrade
    is expected to get stuck on it; the Machine is then disabled so the upgrade can
    finish without it. The broken Machine is fixed and enabled in another test.

    Scenario:
    1. Break the LCM tasks of the selected Machine
    2. Start the Child cluster upgrade
    3. Wait until the upgrade gets stuck on the broken Machine
    4. Disable the broken Machine, move SL and Ceph services away from it
    5. Wait until the Child cluster upgrade is finished
    6. Check the cluster readiness, keeping in mind the disabled Machine
    """

    target_cluster_name = settings.TARGET_CLUSTER
    target_namespace_name = settings.TARGET_NAMESPACE

    child_cluster = kaas_manager.get_namespace(target_namespace_name).get_cluster(target_cluster_name)
    release_before = child_cluster.clusterrelease_version

    # Nothing to do when the cluster is already on the requested release
    if update_release_name == release_before:
        skip_reason = (f"Requested {update_release_name} is the same as current "
                       f"clusterrelease version {release_before}, skipping update")
        LOG.info(skip_reason)
        pytest.skip(skip_reason)

    update_actions = update_child_clusterrelease_actions.UpdateChildClusterreleaseActions(child_cluster)
    update_actions.pre_update(update_release_name)

    # Read the flag from the Child cluster itself, before the upgrade starts: it is needed
    # for the kernel versions check after the upgrade. The flag may have been enabled by
    # another job (or, presumably, during pre_update - TODO confirm), so settings are not
    # a reliable source for it.
    postpone_distro_upgrade = child_cluster.is_postpone_distribution_upgrade_enabled

    LOG.info(f"Cluster release before update {release_before}")
    LOG.banner(f"Updating child cluster {target_cluster_name} to the clusterrelease {update_release_name}")

    # ---------------------------------
    # Select a Machine and break its LCM
    # ---------------------------------
    target_machine = child_cluster.day2operations.get_machine_to_disable()
    child_cluster.day2operations.make_broken_lcm_for_day2_operations(
        machines=[target_machine],
        state_names=['deploy', 'reconfigure'])
    broken_machine_name = target_machine.name
    broken_k8s_node_name = target_machine.get_k8s_node_name()

    # ---------------------
    # Start cluster update
    # ---------------------
    LOG.banner("Start cluster upgrade", sep='#')
    child_cluster.update_cluster(update_release_name)

    child_cluster.check.check_cluster_release(update_release_name)

    # ------------------------------------------------------------------
    # The upgrade must NOT finish: LCMStuckException is the expected result
    # ------------------------------------------------------------------
    LOG.info(f"*** Expect LCM stuck for one of the following broken Machines: "
             f"{broken_machine_name}")
    try:
        child_cluster.check.check_update_finished(
            timeout=settings.KAAS_CHILD_CLUSTER_UPDATE_TIMEOUT,
            interval=120,
            expected_stuck_machine_names=[broken_machine_name])
    except exceptions.LCMStuckException as stuck_error:
        LOG.info(f"Got the expected condition for the broken Machine: {stuck_error}")
    else:
        # The wait returned normally, so the broken Machine did not block the upgrade
        raise Exception(f"Update should be failed, but it is successfully finished with broken LCMMachines, "
                        f"please check the status of LCMMachine '{broken_machine_name}'")

    # 1. The target Machine must be either stuck or already disabled at this point
    machine_is_blocked = target_machine.is_lcmmachine_stuck() or target_machine.is_disabled()
    assert machine_is_blocked, (
        f"Target Machine '{target_machine.name}' is still not stuck or disabled")

    # 2. Disable the stuck Machine
    child_cluster.day2operations.disable_machine(target_machine)

    # 3. Unbind Stacklight pods from the disabled Machine
    child_cluster.day2operations.disable_stacklight_for_machine(
        broken_machine_name, broken_k8s_node_name)

    # 4. Remove the Machine from KaasCephCluster, Ceph roles go to other Machines
    child_cluster.day2operations.disable_ceph_for_machine(
        broken_machine_name, broken_k8s_node_name)

    # 5. Clean up pods and pvc left on the k8s node of the stuck Machine
    child_cluster.day2operations.cleanup_pods_and_pvc_from_k8s_node(broken_k8s_node_name)

    LOG.banner(f"Machine '{target_machine.name}' was successfully disabled, continue cluster upgrade")
    child_cluster.check.check_update_finished(
        timeout=settings.KAAS_CHILD_CLUSTER_UPDATE_TIMEOUT,
        interval=120)

    LOG.info(f"Upgrade is completed for all cluster Machines except {broken_machine_name}")

    # ----------------------------
    # Cluster update is completed
    # ----------------------------
    LOG.banner("Cluster update is completed", sep="#")

    # Readiness checks below run against a cluster with a 'disabled' Machine.
    # Some checks (like check for 'reboot required' flag) are covered in other SI tests and skipped here

    LOG.banner("Check cluster Machines readiness")
    child_cluster.check.check_machines_status()

    LOG.banner("Check Cluster conditions readiness")
    readiness_timeout = settings.CHECK_CLUSTER_READINESS_TIMEOUT + 1800
    child_cluster.day2operations.check_cluster_readiness(exp_provider_status=False,
                                                         timeout=readiness_timeout)
    LOG.banner("Check cluster nodes count")
    child_cluster.check.check_cluster_nodes()
    LOG.banner("Check cluster Pods")
    child_cluster.check.check_k8s_pods()
    LOG.banner("Check cluster Helmbundles")
    child_cluster.check.check_helmbundles()
    LOG.banner("Check cluster Nodes readiness")
    child_cluster.check.check_k8s_nodes()

    LOG.banner("Check cluster upgrade stages")
    # On vSphere without IPAM the 'Network prepared' stage is excluded from the check
    stage_check_kwargs = {}
    if child_cluster.provider is Provider.vsphere and not settings.KAAS_VSPHERE_IPAM_ENABLED:
        stage_check_kwargs['skipped_stages_names'] = 'Network prepared'
    child_cluster.check.check_upgrade_stage_success(**stage_check_kwargs)

    LOG.banner("Check actual/expected pods")
    child_cluster.check.check_actual_expected_pods()

    LOG.banner("Check leftovers after upgrade")
    if child_cluster.provider is not Provider.byo:
        child_cluster.check.check_no_leftovers_after_upgrade()

    LOG.banner("Check ceph pvc")
    if child_cluster.is_ceph_deployed:
        child_cluster.check.check_ceph_pvc()

    update_actions.reconfigure_coredns_for_mosk()

    LOG.banner("Check cluster k8s version")
    version_mismatches = _find_k8s_version_mismatches(child_cluster)
    assert not version_mismatches, f"k8s versions mismatch Details: {version_mismatches}"

    if child_cluster.provider is Provider.baremetal:
        LOG.banner("Check repository url for child cluster")
        child_cluster.check.check_repository_url()
        LOG.banner("Check kernel versions")
        child_cluster.check.check_actual_expected_kernel_versions(postpone_distro_upgrade)

    # Keep the artifacts of the upgraded cluster
    child_cluster.store_k8s_artifacts()
    child_cluster.provider_resources.save_artifact()


@pytest.mark.parametrize("_", [f"CLUSTER_NAME={settings.TARGET_CLUSTER}"])
@pytest.mark.usefixtures("store_updated_child_cluster_description")
@pytest.mark.usefixtures("introspect_child_target_objects")
@pytest.mark.usefixtures('create_hoc_before_lcm_and_delete_after')
def test_update_child_clusterrelease_with_bad_machine_to_replace(kaas_manager, update_release_name, _):
    """Start a child cluster upgrade with a broken Machine that has to be replaced ASAP

    One Machine gets its LCM tasks corrupted before the upgrade starts, which blocks
    the upgrade until the Machine is replaced. This test only brings the cluster into
    that state: the Machine is replaced and the upgrade is completed in another test.

    Scenario:
    1. Break the LCM tasks of the selected Machine
    2. Start the Child cluster upgrade
    3. Wait until the upgrade gets stuck on the broken Machine
    4. Disable the broken Machine
    """

    target_cluster_name = settings.TARGET_CLUSTER
    target_namespace_name = settings.TARGET_NAMESPACE

    child_cluster = kaas_manager.get_namespace(target_namespace_name).get_cluster(target_cluster_name)
    release_before = child_cluster.clusterrelease_version

    # Nothing to do when the cluster is already on the requested release
    if update_release_name == release_before:
        skip_reason = (f"Requested {update_release_name} is the same as current "
                       f"clusterrelease version {release_before}, skipping update")
        LOG.info(skip_reason)
        pytest.skip(skip_reason)

    LOG.info(f"Cluster release before update {release_before}")
    LOG.banner(f"Updating child cluster {target_cluster_name} to the clusterrelease {update_release_name}")

    # ---------------------------------
    # Select a Machine and break its LCM
    # ---------------------------------
    target_machine = child_cluster.day2operations.get_machine_to_disable()
    child_cluster.day2operations.make_broken_lcm_for_day2_operations(
        machines=[target_machine],
        state_names=['deploy', 'reconfigure'])
    broken_machine_name = target_machine.name

    # ---------------------
    # Start cluster update
    # ---------------------
    LOG.banner("Start cluster upgrade", sep='#')
    child_cluster.update_cluster(update_release_name)

    child_cluster.check.check_cluster_release(update_release_name)

    # ------------------------------------------------------------------
    # The upgrade must NOT finish: LCMStuckException is the expected result
    # ------------------------------------------------------------------
    LOG.info(f"*** Expect LCM stuck for one of the following broken Machines: "
             f"{broken_machine_name}")
    try:
        child_cluster.check.check_update_finished(
            timeout=settings.KAAS_CHILD_CLUSTER_UPDATE_TIMEOUT,
            interval=120,
            expected_stuck_machine_names=[broken_machine_name])
    except exceptions.LCMStuckException as stuck_error:
        LOG.info(f"Got the expected condition for the broken Machine: {stuck_error}")
    else:
        # The wait returned normally, so the broken Machine did not block the upgrade
        raise Exception(f"Update should be failed, but it is successfully finished with broken LCMMachines, "
                        f"please check the status of LCMMachine '{broken_machine_name}'")

    # 1. The target Machine must be either stuck or already disabled at this point
    machine_is_blocked = target_machine.is_lcmmachine_stuck() or target_machine.is_disabled()
    assert machine_is_blocked, (
        f"Target Machine '{target_machine.name}' is still not stuck or disabled")

    # 2. Disable the stuck Machine
    child_cluster.day2operations.disable_machine(target_machine)

    # TODO(ddmitriev): use some check to ensure that operation is actually stuck
    LOG.banner("Stop waiting for cluster upgrade. Need to replace the disabled Machines ASAP using another test")
