import copy
import time
import pytest

from si_tests import logger
from si_tests import settings
from si_tests.managers.kaas_manager import Machine, Cluster, Manager  # noqa: F401
from si_tests.managers.openstack_manager import OpenStackManager
from si_tests.utils import waiters
from si_tests.utils import exceptions
from si_tests.utils import packaging_version as version
from si_tests.tests.lcm.test_replace_bm_master_node import (
    wait_ceph_status,
    is_precaching_required)

LOG = logger.logger


def get_ironic_pods(cluster: Cluster):
    """Return the pods listed in the "kaas" namespace with the 'ironic' name prefix.

    The lookup goes through ``cluster.k8sclient.pods.list`` and its result is
    returned unchanged.
    """
    pods_client = cluster.k8sclient.pods
    return pods_client.list(namespace="kaas", name_prefix='ironic')


def replace_with_ironic(cluster: Cluster, replace_node_k8_name):
    """Tell whether the node to be replaced hosts the first listed ironic pod.

    Compares ``node_name`` of the first pod returned by ``get_ironic_pods``
    with ``replace_node_k8_name``, logs the outcome and returns it.
    """
    first_ironic_pod = get_ironic_pods(cluster)[0]
    hosts_ironic = first_ironic_pod.node_name == replace_node_k8_name
    LOG.info(f"replace node with ironic: {hosts_ironic}")
    return hosts_ironic


def check_deployments(dp):
    """Return True when every deployment in ``dp`` has a truthy ``ready``.

    Evaluation stops at the first deployment that is not ready, in which
    case False is returned. An empty ``dp`` yields True.
    """
    LOG.info("Wait deployments ready(with replicas = 1)")
    return all(deployment.ready for deployment in dp)


# Add additional machine statuses to see if at least 2 nodes are ready.
# https://mirantis.jira.com/browse/PRODX-33984
def wait_machines_status_ready(cluster: Cluster):
    """Report whether at least two control plane machines are ready.

    For every machine returned by ``cluster.get_machines(machine_type='control')``
    the machine status and the ``status.providerStatus.ready`` value from the
    machine data are collected and logged together with the cluster status.

    Args:
        cluster: cluster whose control machines are inspected.

    Returns:
        True when two or more control machines have a truthy ``ready`` value,
        False otherwise.
    """
    control_machines = cluster.get_machines(machine_type='control')
    statuses = {machine.name: (machine.machine_status,
                               machine.data.get('status', {}).get('providerStatus', {}).get('ready'))
                for machine in control_machines}
    LOG.info("Wait until at least 2 machines ready.\nCluster status: {}\nCurrent machines statuses: \n{}".format(
        cluster.cluster_status,
        '\n'.join('{} - Status={} Ready={}'.format(name, status, ready)
                  for name, (status, ready) in statuses.items())
    ))
    # ``ready`` is None when the key is missing from providerStatus; coerce to
    # bool so the count never fails with ``int + None``.
    ready_machines = sum(bool(ready) for _, ready in statuses.values())
    LOG.info("Ready {} of {} machine(s)".format(ready_machines, len(control_machines)))
    return ready_machines >= 2


@pytest.mark.usefixtures("introspect_distribution_not_changed")
@pytest.mark.usefixtures("collect_downtime_statistics")     # Should be used if ALLOW_WORKLOAD == True
@pytest.mark.usefixtures('mcc_loadtest_prometheus')
@pytest.mark.usefixtures('mcc_loadtest_grafana')
@pytest.mark.usefixtures('mcc_loadtest_alerta')
@pytest.mark.usefixtures('mcc_loadtest_keycloak')
@pytest.mark.usefixtures('create_hoc_before_lcm_and_delete_after')
def test_replace_broken_drive_master_node(kaas_manager: Manager, show_step):
    """Replace a BM control plane node after filling up its filesystems.

    The target cluster and namespace are taken from ``settings.TARGET_CLUSTER``
    and ``settings.TARGET_NAMESPACE``.

    Scenario:
        1. Find master node where VIP hosted
        2. Fill disk of selected master node
        3. Replace master Machine
        4. Add new node to the Ceph cluster
        5. Delete all Pending openstack pvc for pods (optional)
        6. Check OpenStack readiness (optional)
        7. Check cluster readiness
    """

    cluster_name = settings.TARGET_CLUSTER
    namespace_name = settings.TARGET_NAMESPACE

    ns = kaas_manager.get_namespace(namespace_name)
    LOG.info("Namespace name - %s", namespace_name)
    cluster = ns.get_cluster(cluster_name)
    LOG.info("Cluster name - %s", cluster_name)

    show_step(1)
    master = cluster.get_keepalive_master_machine()
    check_precaching = is_precaching_required(master)
    k8s_node_name = master.get_k8s_node_name()
    # Informational only: the flag is logged but does not alter the flow below.
    ironic_flag = not cluster.is_child and replace_with_ironic(cluster, k8s_node_name)
    LOG.info(f"ironic_flag = {ironic_flag}")

    show_step(2)
    LOG.info(f"Fill disk of master node - {master.name}")

    # Find disks which have partitions and mount points to fill them
    ansible_extra = ns.get_ansibleextra(name=master.get_bmh_name())
    target_storage = ansible_extra.data['spec']['target_storage']

    parted_disks = [s for s in target_storage if 'partition_schema' in s]
    lvm_groups = [s for s in target_storage if 'lvm_groups' in s]
    raid_devices = [s for s in target_storage if 'md_devices' in s]

    # Only mounted ext4/xfs filesystems are collected as fill targets.
    fillable_fs_types = ('ext4', 'xfs')
    mount_points = []
    for disk in parted_disks:
        for partition in disk['partition_schema']:
            if 'mount' in partition and partition['filesystem']['type'] in fillable_fs_types:
                mount_points.append(partition['mount']['point'])

    for subgroup in lvm_groups:
        for group in subgroup['lvm_groups']:
            if not group['create']:
                continue

            for name in group['lvnames']:
                if not name['create']:
                    continue

                if 'mount' in name and name['filesystem']['type'] in fillable_fs_types:
                    mount_points.append(name['mount']['point'])

    for raid_device in raid_devices:
        for md_device in raid_device['md_devices']:
            if not md_device.get('create'):
                continue
            if 'mount' in md_device and md_device['filesystem']['type'] in fillable_fs_types:
                mount_points.append(md_device['mount']['point'])

    assert mount_points, "No mount points found to fill the filesystem"

    LOG.info("Mount points to fill - %s", mount_points)
    master.exec_pod_cmd("df -h | grep -v /var/lib/docker | grep -v /var/lib/kubelet", verbose=True)

    # Build the fill file path as "<mount_point>/fill.out". Plain concatenation
    # would turn "/var" into "/varfill.out", which lives on the parent
    # filesystem instead of the one that has to be filled.
    commands = [
        f"sudo fallocate -l $(df {mount_point} -h --output=size | tail -n1) "
        f"{mount_point.rstrip('/')}/fill.out"
        for mount_point in mount_points]
    commands.append(
        "df -h | grep -v /var/lib/docker | grep -v /var/lib/kubelet"
    )
    command = " ; ".join(commands)
    LOG.info("Execute command - %s", command)
    try:
        master.exec_pod_cmd(cmd=command, verbose=True, timeout=30, delete_pod=False)
    except exceptions.TimeoutError:
        # A timeout here is tolerated: the node is expected to misbehave once its disks are full.
        LOG.info("Got timeout error. Expected behavior after disk filling")

    time.sleep(30)  # wait to LB ip moves to new node
    cluster.k8sclient.login()  # try to relogin after LB ip moved to new node

    if cluster.workaround.field_5850():
        # With the workaround active, readiness is read from the Machine object itself.
        LOG.info("Wait until machine '%s' become not ready", master.name)
        waiters.wait(lambda: not master.data.get('status', {}).get('providerStatus', {}).get('ready'),
                     timeout=900, interval=30,
                     timeout_msg=f"Timeout waiting until machine '{master.name}' become Not Ready")
    else:
        LOG.info("Get k8s node for node %s", master.name)
        master_k8s_node = master.get_k8s_node()
        LOG.info("Wait until k8s node %s become not ready", master_k8s_node.name)
        cluster.check.wait_k8s_node_status(
            master_k8s_node.name,
            expected_status="NotReady",
            timeout=900)

    k8s_nodes = cluster.get_k8s_nodes()
    for node in k8s_nodes:
        LOG.info("Node %s has conditions: %s", node['metadata']['name'], node['status']['conditions'])
    if not cluster.is_child:
        # Single-replica deployments in the 'kaas' namespace must be ready again before the replacement.
        deployments = cluster.k8sclient.deployments.list(namespace='kaas')
        deployments_with_replicas_one = [i for i in deployments if i.data['spec']['replicas'] == 1]

        waiters.wait(
            lambda: check_deployments(deployments_with_replicas_one),
            timeout=1200, interval=60,
            timeout_msg="Deployments with replicas = 1 not ready yet")

    waiters.wait(lambda: wait_machines_status_ready(cluster=cluster), interval=30, timeout=900,
                 timeout_msg="Waiting timeout for at least 2 machines in Ready status")

    show_step(3)
    new_master = cluster.day2operations.replace_baremetal_machine_and_bmh(master, machine_deletion_policy="unsafe")
    new_master_node_name = new_master.get_k8s_node_name()
    LOG.info(f"New Machine with master role: '{new_master.name}'")

    # Check machine runtime
    if settings.DESIRED_RUNTIME:
        cluster.check.compare_machines_runtime_with_desired([new_master])

    show_step(4)
    if cluster.is_child:
        LOG.info("Ceph cluster rebalancing may take some time. Wait timeout is 1h.")
        # Waiting for actual Ceph status from Ceph tools pod
        LOG.info("Wait Ceph HEALTH_OK status in Ceph tools")
        waiters.wait(lambda: wait_ceph_status(cluster), timeout=3600, interval=30)
        # Wait until KaaS update Cluster kind with Ceph status
        LOG.info("Wait Ceph HEALTH_OK status in cluster object")
        try:
            health_info = cluster.check.get_ceph_health_detail()
            assert health_info['status'] == "HEALTH_OK", f'Health is not OK. Will not proceed. ' \
                                                         f'Current ceph health status: {health_info}'
        except AssertionError:
            # Not healthy yet: fall back to waiting for the health status.
            cluster.check.wait_ceph_health_status(timeout=600, interval=30)

    if cluster.clusterrelease_version.startswith(settings.MOSK_RELEASE_PREFIX) \
            and cluster.is_os_deployed():
        # OpenStackManager is given a kubeconfig file path, so the kubeconfig is dumped to a local file first.
        child_kubeconfig_name, child_kubeconfig = cluster.get_kubeconfig_from_secret()
        with open('child_conf', 'w') as f:
            f.write(child_kubeconfig)
        os_manager = OpenStackManager(kubeconfig='child_conf')
        if check_precaching:
            LOG.info("Wait image-precaching Ready")
            selector = f"spec.nodeName={new_master_node_name}"

            image_precaching_pod_timeout = 1800
            image_precaching_pod_timeout_msg = f"Image-precaching pod not found on node {new_master_node_name}"
            cluster.k8sclient.pods.wait_pod_present(timeout=image_precaching_pod_timeout,
                                                    interval=15,
                                                    timeout_msg=image_precaching_pod_timeout_msg,
                                                    namespace="openstack",
                                                    name_prefix='image-precaching',
                                                    field_selector=selector)
            image_precaching_pod = cluster.k8sclient.pods.list(namespace="openstack",
                                                               name_prefix='image-precaching',
                                                               field_selector=selector)
            assert image_precaching_pod, f"Image-precaching pod not found on node {new_master_node_name}"
            image_precaching_pod = image_precaching_pod[0]
            image_precaching_pod.wait_ready(timeout=5400)

        if version.parse(cluster.clusterrelease_version) < version.parse('mosk-17-1-0-rc-24-1'):
            show_step(5)  # Delete all Pending pvc for pods
            cluster.delete_pending_openstack_pods()

        if new_master.has_nodelabels([{'key': 'openstack-control-plane', 'value': 'enabled'}]):
            LOG.info("Perform actions for node with openstack control plane role")
            cluster.day2operations.recreate_octavia_resources_and_check(os_manager, k8s_node_name, new_master_node_name)

        show_step(6)
        LOG.info("Wait osdpl health status=Ready")
        os_manager.wait_openstackdeployment_health_status(timeout=1800)
        LOG.info("Wait os jobs to success and pods to become Ready")
        os_manager.wait_os_resources(timeout=1800)

    show_step(7)
    # Check/wait for correct docker service replicas in cluster
    ucp_worker_agent_name = cluster.check.get_ucp_worker_agent_name()
    cluster.check.check_actual_expected_docker_services(
        changed_after_upd={'ucp-worker-agent-x': ucp_worker_agent_name})
    cluster.check.check_k8s_pods()
    cluster.check.check_actual_expected_pods(timeout=3200)
    cluster.check.check_cluster_readiness()
    cluster.check.check_diagnostic_cluster_status()
    cluster.check.check_deploy_stage_success()

    cluster.check.check_bmh_inventory_presense()


@pytest.mark.usefixtures("introspect_distribution_not_changed")
@pytest.mark.usefixtures("collect_downtime_statistics")     # Should be used if ALLOW_WORKLOAD == True
@pytest.mark.usefixtures('create_hoc_before_lcm_and_delete_after')
def test_replace_broken_network_master_node(kaas_manager: Manager, show_step):
    """Replace a BM control plane node after disabling its network.

    The target cluster and namespace are taken from ``settings.TARGET_CLUSTER``
    and ``settings.TARGET_NAMESPACE``.

    Scenario:
        1. Find master node where VIP hosted
        2. Collect data about BMH/secret/node
        3. Disable network on master node
        4. Replace master Machine
        5. Add new node to the Ceph cluster
        6. Delete all Pending openstack pvc for pods (optional)
        7. Check OpenStack readiness (optional)
        8. Check cluster readiness
    """

    cluster_name = settings.TARGET_CLUSTER
    namespace_name = settings.TARGET_NAMESPACE

    ns = kaas_manager.get_namespace(namespace_name)
    LOG.info("Namespace name - %s", namespace_name)

    cluster = ns.get_cluster(cluster_name)
    LOG.info("Cluster name - %s", cluster_name)
    show_step(1)
    master = cluster.get_keepalive_master_machine()
    check_precaching = is_precaching_required(master)
    k8s_node_name = master.get_k8s_node_name()
    # NOTE(review): ironic_flag is only logged; nothing below branches on it.
    if not cluster.is_child and replace_with_ironic(cluster, k8s_node_name):
        ironic_flag = True
        LOG.info(f"ironic_flag = {ironic_flag}")
    else:
        ironic_flag = False
        LOG.info(f"ironic_flag = {ironic_flag}")

    show_step(2)
    # The BMH name is the part after "/" in the machine annotation value
    # (presumably "<namespace>/<name>" - verify against the annotation format).
    # NOTE(review): a missing annotation makes .get() return None and .split() fail.
    machine_bmh_name = master.metadata['annotations'].get('metal3.io/BareMetalHost')
    bmh = ns.get_baremetalhost(name=machine_bmh_name.split("/")[1])
    # Keep a private copy of the BMH data so the NIC list below does not depend on later object changes.
    old_bmh_data = copy.deepcopy(bmh.data)

    show_step(3)
    LOG.info(f"Disable (block) network on master node - {master.name}")
    # Name and MAC of every NIC reported in the BMH hardware status.
    interfaces = [
        {"name": nic['name'], "mac": nic["mac"]}
        for nic in old_bmh_data['status']['hardware']['nics']
    ]
    # Content of a per-interface file written under /etc/systemd/network: matches the
    # interface by MAC and name and sets Unmanaged=yes in the [Link] section.
    nics_config_template = """[Match]

MACAddress={iface_mac}

Name={iface_name}


[Link]

Unmanaged=yes
"""

    def gen_nics_config(num, iface_name, iface_mac):
        # Build the target file name and rendered content for one interface.
        return {
            "iface_name": iface_name,
            "filename": f"/etc/systemd/network/{num}-{iface_name}.network",
            "content": nics_config_template.format(iface_mac=iface_mac, iface_name=iface_name)
        }

    nics_config = [
        gen_nics_config(num, nic['name'], nic["mac"]) for num, nic in enumerate(interfaces)
    ]
    # Write each generated config file on the master node, one command per interface.
    for nic_config in nics_config:
        LOG.info("Put config for %s to %s:\n%s", nic_config['iface_name'],
                 nic_config['filename'], nic_config['content'])
        master.exec_pod_cmd(
            f"echo \\\"{nic_config['content']}\\\" > {nic_config['filename']}",
            verbose=True)
    LOG.info("Remove already generated nics configs")
    master.exec_pod_cmd("mv /var/run/systemd/network /root/systemd_network", verbose=True)

    # TODO: upload nics config to the master node
    # Collect one "ip link set <iface> down" command per interface; they are run
    # right after the systemd-networkd restart in a single shell command line.
    link_downs_cmds = []
    for nic_config in nics_config:
        link_downs_cmds.append(
            f"ip link set {nic_config['iface_name']} down"
        )
    link_downs_cmd = "; ".join(link_downs_cmds)
    LOG.info("Restart network manager")
    try:
        master.exec_pod_cmd(
            f"sleep 5; systemctl restart systemd-networkd; {link_downs_cmd}",
            verbose=True,
            timeout=50,
            delete_pod=False)
    except exceptions.TimeoutError as e:
        # A timeout is tolerated here: the command cuts the node's network, so the test continues.
        LOG.info("Got timeout error. To pass test after network disabling on the node %s", master.name, exc_info=e)

    time.sleep(30)  # wait to LB ip moves to new node
    cluster.k8sclient.login()  # try to relogin after LB ip moved to new node

    # The node name collected at step 1 is reused; the node is not queried again here.
    LOG.info("Get k8s node for node %s", master.name)
    LOG.info("Wait until k8s node %s become not ready", k8s_node_name)
    cluster.check.wait_k8s_node_status(
        k8s_node_name,
        expected_status="NotReady",
        timeout=900)

    k8s_nodes = cluster.get_k8s_nodes()
    for node in k8s_nodes:
        LOG.info("Node %s has conditions: %s", node['metadata']['name'], node['status']['conditions'])

    if not cluster.is_child:
        # Single-replica deployments in the 'kaas' namespace must be ready again before the replacement.
        deployments = cluster.k8sclient.deployments.list(namespace='kaas')
        deployments_with_replicas_one = [i for i in deployments if i.data['spec']['replicas'] == 1]

        waiters.wait(
            lambda: check_deployments(deployments_with_replicas_one),
            timeout=1200, interval=60,
            timeout_msg="Deployments with replicas = 1 not ready yet")

    waiters.wait(lambda: wait_machines_status_ready(cluster=cluster), interval=30, timeout=900,
                 timeout_msg="Waiting timeout for at least 2 machines in Ready status")

    show_step(4)
    new_master = cluster.day2operations.replace_baremetal_machine_and_bmh(master, machine_deletion_policy="unsafe")
    new_master_node_name = new_master.get_k8s_node_name()
    LOG.info(f"New Machine with master role: '{new_master.name}'")

    # Check machine runtime
    if settings.DESIRED_RUNTIME:
        cluster.check.compare_machines_runtime_with_desired([new_master])

    show_step(5)
    if cluster.is_child:
        LOG.info("Ceph cluster rebalancing may take some time. Wait timeout is 1h.")
        # Waiting for actual Ceph status from Ceph tools pod
        LOG.info("Wait Ceph HEALTH_OK status in Ceph tools")
        waiters.wait(lambda: wait_ceph_status(cluster), timeout=3600, interval=30)
        # Wait until KaaS update Cluster kind with Ceph status
        LOG.info("Wait Ceph HEALTH_OK status in cluster object")
        try:
            health_info = cluster.check.get_ceph_health_detail()
            assert health_info['status'] == "HEALTH_OK", f'Health is not OK. Will not proceed. ' \
                                                         f'Current ceph health status: {health_info}'
        except AssertionError:
            # Not healthy yet: fall back to waiting for the health status.
            cluster.check.wait_ceph_health_status(timeout=600, interval=30)

    if cluster.clusterrelease_version.startswith(settings.MOSK_RELEASE_PREFIX) \
            and cluster.is_os_deployed():
        # OpenStackManager is given a kubeconfig file path, so the kubeconfig is dumped to a local file first.
        child_kubeconfig_name, child_kubeconfig = cluster.get_kubeconfig_from_secret()
        with open('child_conf', 'w') as f:
            f.write(child_kubeconfig)
        os_manager = OpenStackManager(kubeconfig='child_conf')
        if check_precaching:
            LOG.info("Wait image-precaching Ready")
            # Restrict the pod lookup to the newly created master node.
            selector = f"spec.nodeName={new_master_node_name}"

            image_precaching_pod_timeout = 1800
            image_precaching_pod_timeout_msg = f"Image-precaching pod not found on node {new_master_node_name}"
            cluster.k8sclient.pods.wait_pod_present(timeout=image_precaching_pod_timeout,
                                                    interval=15,
                                                    timeout_msg=image_precaching_pod_timeout_msg,
                                                    namespace="openstack",
                                                    name_prefix='image-precaching',
                                                    field_selector=selector)
            image_precaching_pod = cluster.k8sclient.pods.list(namespace="openstack",
                                                               name_prefix='image-precaching',
                                                               field_selector=selector)
            assert image_precaching_pod, f"Image-precaching pod not found on node {new_master_node_name}"
            image_precaching_pod = image_precaching_pod[0]
            image_precaching_pod.wait_ready(timeout=5400)

        # The pending-pods cleanup is applied only to releases older than the version below.
        if version.parse(cluster.clusterrelease_version) < version.parse('mosk-17-1-0-rc-24-1'):
            show_step(6)  # Delete all Pending pvc for pods
            cluster.delete_pending_openstack_pods()

        if new_master.has_nodelabels([{'key': 'openstack-control-plane', 'value': 'enabled'}]):
            LOG.info("Perform actions for node with openstack control plane role")
            cluster.day2operations.recreate_octavia_resources_and_check(os_manager, k8s_node_name, new_master_node_name)

        cluster.day2operations.cleanup_ovn_db_inactive_members(new_master_node_name)

        show_step(7)
        LOG.info("Wait osdpl health status=Ready")
        os_manager.wait_openstackdeployment_health_status(timeout=1800)
        LOG.info("Wait os jobs to success and pods to become Ready")
        os_manager.wait_os_resources(timeout=1800)

    show_step(8)
    # Check/wait for correct docker service replicas in cluster
    ucp_worker_agent_name = cluster.check.get_ucp_worker_agent_name()
    cluster.check.check_actual_expected_docker_services(
        changed_after_upd={'ucp-worker-agent-x': ucp_worker_agent_name})
    cluster.check.check_k8s_pods()
    cluster.check.check_actual_expected_pods(timeout=3200)
    cluster.check.check_cluster_readiness()
    cluster.check.check_diagnostic_cluster_status()
    cluster.check.check_deploy_stage_success()

    cluster.check.check_bmh_inventory_presense()
