#    Copyright 2025 Mirantis, Inc.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

import pytest
import random

from si_tests import logger
from si_tests import settings

from si_tests.utils import waiters
from si_tests.managers.remote_shell_manager import RemoteShellManager

# Shared logger object exposed by the si_tests.logger module.
LOG = logger.logger

# Cluster and namespace under test; both values come from the si_tests settings
# and are read by the test body below.
cluster_name = settings.TARGET_CLUSTER
namespace_name = settings.TARGET_NAMESPACE


def _linux_hard_limits_spec(values, machine_labels):
    """Build the ``spec`` section of a linux_hard_limits HostOSConfiguration.

    Args:
        values: dict passed verbatim as the module ``values``.
        machine_labels: dict of labels used as ``machineSelector.matchLabels``.

    Returns:
        dict with a single ``linux_hard_limits`` entry in ``configs`` (module
        version taken from ``settings.HOC_LHL_PACKAGE_MODULE_VERSION``) and
        the ``machineSelector`` built from ``machine_labels``.
    """
    return {
        'configs': [{
            'module': 'linux_hard_limits',
            'moduleVersion': settings.HOC_LHL_PACKAGE_MODULE_VERSION,
            'values': values,
        }],
        'machineSelector': {
            'matchLabels': machine_labels,
        },
    }


def _request_graceful_reboot(ns, cluster, machine):
    """Create a graceful reboot request for ``machine`` and wait until it is reported.

    Creates the request object through ``ns``, polls
    ``cluster.day2operations.get_reboot_required_status`` (timeout 3600s,
    interval 10s) and finally waits for the graceful reboot request with
    ``expected_status=True``.
    """
    ns.create_gracefulrebootrequest_object(cluster.name, namespace_name, [machine.name])
    waiters.wait(
        lambda: cluster.day2operations.get_reboot_required_status([machine.name], namespace_name),
        timeout=3600, interval=10,
        timeout_msg="Wait for 'reboot' warning for selected machines")
    cluster.check.wait_graceful_reboot_request(expected_status=True)


@pytest.mark.usefixtures("collect_downtime_statistics")     # Should be used if ALLOW_WORKLOAD == True
def test_linux_hard_limits_module_apply_conf(kaas_manager, show_step):
    """ Test linux_hard_limits hoc module
        Scenario:
            1. Check init state cluster
            2. Add label to the first machine
            3. Save default hard limits of the first machine
            4. Create a linux_hard_limits HostOSConfiguration object and wait while it is applied
            5. Reboot the machine
            6. Check hard limits on the machine
            7. Revert config
            8. Wait while the revert is applied, request a reboot and delete the HostOSConfiguration object
            9. Check cluster readiness and default hard limits
    """
    # Only the first machine of the cluster is labeled, configured and rebooted.
    hoc_name = settings.HOC_LHL_TEST_HOC_NAME

    ns = kaas_manager.get_namespace(namespace_name)
    cluster = ns.get_cluster(cluster_name)

    machines = cluster.get_machines()
    assert machines, f"No machines in cluster: {cluster_name}"
    first_machine = machines[0]

    show_step(1)
    LOG.info(f"Check init state on the {cluster._cluster_type} cluster {cluster.namespace}/{cluster.name}")
    cluster.check.check_machines_status()
    cluster.check.check_cluster_readiness()
    cluster.check.check_k8s_nodes()
    cluster.check.wait_graceful_reboot_request(expected_status=False)

    show_step(2)
    # Random two-digit suffix makes the selector label differ between runs.
    day2_label = {"day2-custom-hard-limits-label" + str(random.randint(10, 99)): "true"}
    LOG.info(f"Add label {day2_label} to first machine: {first_machine.name} "
             f"in cluster {cluster.namespace}/{cluster.name}")
    first_machine.add_machine_labels(day2_label)

    show_step(3)
    LOG.info(f"Save default hard limits for first machine: {first_machine.name}")
    default_hard_limits_list = RemoteShellManager.get_linux_hard_limit(machine=first_machine)

    show_step(4)
    # See https://mirantis.jira.com/browse/PRODX-51379
    # WARNING: on host with running Docker Swarm setting limits value
    # for `system` or `root` lower than listed below will cause Docker Swarm to fail:
    # `nofile`: `524288`, `nproc`: `1048576`
    expected_limits = {
        'cleanup_before': True,
        'system': {
            'nofile': 526336,
            'nproc': 1050624
        },
        'users': {
            'mcc-user': {
                'nofile': 525312,
                'nproc': 1049600
            }
        }
    }
    hostoscfg_data = {
        'apiVersion': 'kaas.mirantis.com/v1alpha1',
        'kind': 'HostOSConfiguration',
        'metadata': {
            'name': hoc_name,
            'namespace': namespace_name
        },
        'spec': _linux_hard_limits_spec(expected_limits, day2_label)
    }

    # Remember LCMMachines timestamps before applying the HostOSConfiguration
    lcmmachines_timestamps_before = cluster.get_cluster_lcmmachines_timestamps()
    # Create the HostOSConfiguration or update the existing one
    if ns.hostosconfiguration_is_present(name=hoc_name):
        hostoscfg = ns.get_hostosconfiguration(name=hoc_name)
        hostoscfg.patch(hostoscfg_data)
    else:
        hostoscfg = ns.create_hostosconfiguration_raw(hostoscfg_data)

    # Wait for the selected Machine in the hostosconfiguration status
    LOG.info("Check that new items added into stateItems in LCMMachine")
    cluster.check.wait_lcmmachine_day2_stateitems(hostoscfg, lcmmachines_timestamps_before)
    cluster.check.check_cluster_readiness()
    cluster.check.get_hostosconfig_machines_status(hostoscfg)

    show_step(5)
    LOG.info(f"Creating Graceful Reboot Request for {first_machine.name} "
             f"in cluster {cluster.namespace}/{cluster.name}")
    _request_graceful_reboot(ns, cluster, first_machine)

    LOG.info(f'Waiting for {first_machine.name} to reboot')
    # NOTE(review): the boot time baseline is taken after the reboot request was
    # created; confirm the machine cannot finish rebooting before this point.
    boot_time_dict = cluster.get_boot_time_dict(exclude_bastion=True, lcm_machines=[first_machine.lcmmachine])
    machines_number = len(boot_time_dict)
    # Rebooting BM machines takes about 10-15 minutes, but sometimes may take 25+ minutes
    machines_reboot_timeout = 1800 * machines_number
    cluster.check.wait_machines_reboot(boot_time_dict, timeout=machines_reboot_timeout)

    LOG.info(f"Check cluster {cluster.namespace}/{cluster.name} for init state")
    cluster.check.wait_graceful_reboot_request(expected_status=False, timeout=600)
    cluster.check.check_machines_status()
    cluster.check.check_cluster_readiness()

    LOG.info("Check that new items added into machineTypes in LCMCluster")
    cluster.check.wait_lcmcluster_day2_machinetypes(hostoscfg)
    LOG.info("Check that new items added into stateItems in LCMMachine")
    cluster.check.wait_lcmmachine_day2_stateitems(hostoscfg, lcmmachines_timestamps_before)
    cluster.check.get_hostosconfig_machines_status(hostoscfg)

    show_step(6)
    LOG.info(f"Check that hard limits were updated on machine: {first_machine.name}")
    hard_limits_list = RemoteShellManager.get_linux_hard_limit(machine=first_machine)

    # The `system` values of the module are compared with the limits reported for `root`.
    expected_by_owner = (
        ('mcc-user', expected_limits['users']['mcc-user']),
        ('root', expected_limits['system']),
    )
    for owner, owner_limits in expected_by_owner:
        for limit_name in ('nproc', 'nofile'):
            actual_value = hard_limits_list[owner][limit_name]
            assert owner_limits[limit_name] == actual_value, (
                f"Unexpected hard limit '{limit_name}' for '{owner}' on machine {first_machine.name}: "
                f"expected {owner_limits[limit_name]}, got {actual_value}")

    show_step(7)
    # Remember LCMMachines timestamps before patching HostOSConfiguration
    lcmmachines_timestamps_before = cluster.get_cluster_lcmmachines_timestamps()

    LOG.info(f"Cleanup configuration '{hoc_name}' applied to machine {first_machine.name}")
    # Only `cleanup_before` is left in the module values to revert the custom limits.
    hostoscfg.patch({"spec": _linux_hard_limits_spec({"cleanup_before": True}, day2_label)})

    show_step(8)
    # Wait for the selected Machines in the hostosconfiguration status
    LOG.info("Check that machines from hostosconfiguration status field have labels used for machineSelector")
    cluster.check.check_hostosconfig_machine_selector(hostoscfg)
    LOG.info("Check that new items added into machineTypes in LCMCluster")
    cluster.check.wait_lcmcluster_day2_machinetypes(hostoscfg)
    LOG.info("Check that new items added into stateItems in LCMMachine")
    cluster.check.wait_lcmmachine_day2_stateitems(hostoscfg, lcmmachines_timestamps_before)
    cluster.check.get_hostosconfig_machines_status(hostoscfg)

    # Waiting for machines are Ready
    cluster.check.check_machines_status()
    cluster.check.check_cluster_readiness()

    LOG.info(f"Creating Graceful Reboot Request for {first_machine.name} "
             f"machine in cluster {cluster.namespace}/{cluster.name}")
    # NOTE(review): unlike step 5 there is no wait_machines_reboot() here; the end of
    # the reboot is only observed through wait_graceful_reboot_request(expected_status=False)
    # in step 9 - confirm this is intended.
    _request_graceful_reboot(ns, cluster, first_machine)

    LOG.info(f"Delete HostOSConfiguration '{hoc_name}'")
    existing_config = ns.get_hostosconfiguration(name=hoc_name)
    existing_config.delete(async_req=True)
    timeout_msg = f"HostOSConfiguration {hoc_name} was not deleted"
    waiters.wait(lambda: not bool(ns.hostosconfiguration_is_present(name=hoc_name)),
                 timeout=1200,
                 interval=10,
                 timeout_msg=timeout_msg)

    show_step(9)
    LOG.info(f"Check cluster {cluster.namespace}/{cluster.name} after delete HostOSConfiguration")
    cluster.check.wait_graceful_reboot_request(expected_status=False, timeout=600)
    cluster.check.check_machines_status()
    cluster.check.check_cluster_readiness()

    LOG.info(f"Check change to default hard limits on machine: {first_machine.name}")
    restored_hard_limits_list = RemoteShellManager.get_linux_hard_limit(machine=first_machine)
    assert default_hard_limits_list == restored_hard_limits_list, (
        f"Hard limits on machine {first_machine.name} were not restored to defaults: "
        f"expected {default_hard_limits_list}, got {restored_hard_limits_list}")
