Implement automated tests for Salt master backup/restore

Related-PROD: PROD-32676
Change-Id: Ied9b5dcc6d0b87bfcea5cbdbffe48fa98d26e8e3
diff --git a/tcp_tests/tests/system/test_backup_restore.py b/tcp_tests/tests/system/test_backup_restore.py
index b99f408..4e9c46a 100644
--- a/tcp_tests/tests/system/test_backup_restore.py
+++ b/tcp_tests/tests/system/test_backup_restore.py
@@ -13,8 +13,12 @@
 #    under the License.
 import pytest
 
+from devops.helpers import helpers
+from devops.helpers.proc_enums import ExitCodes
+
 from tcp_tests import logger
 from tcp_tests.managers import backup_restore_manager
+from tcp_tests import settings
 
 LOG = logger.logger
 
@@ -22,10 +26,381 @@
 class TestBackupRestoreMaster(object):
     """Test class for testing backup restore of master node"""
 
+    ENV_NAME = settings.ENV_NAME
+
+    BCKP_SERVER_DIR = "/srv/volumes/backup/backupninja"
+    RECLASS_DIR = "/srv/salt/reclass"
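+    # Reclass files that the tests delete or modify in order to simulate
+    # loss/corruption of Salt master data; they are checked again after the
+    # restore by verify_restored_data().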
+    FILES_TO_DELETE = [
+        "nodes/_generated/log02.{}.local.yml".format(ENV_NAME),
+        "classes/cluster/{}/stacklight/telemetry.yml".format(ENV_NAME),
+        "classes/service/barbican",
+        "classes/system/prometheus/alertmanager/container.yml"
+    ]
+    FILES_TO_UPDATE = [
+        "nodes/_generated/mtr02.{}.local.yml".format(ENV_NAME),
+        "classes/cluster/{}/ceph/rgw.yml".format(ENV_NAME),
+        "classes/system/grafana/client/single.yml"
+    ]
+
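+    # Jenkins (Drivetrain) jobs and timeouts used by the pipeline-based
+    # backup/restore test.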
+    BACKUP_JOB_NAME = 'backupninja_backup'
+    BACKUP_JOB_PARAMETERS = {
+        "ASK_CONFIRMATION": False
+    }
+    RESTORE_JOB_NAME = 'backupninja_restore'
+    JENKINS_START_TIMEOUT = 60
+    JENKINS_BUILD_TIMEOUT = 60 * 30
+
+    @pytest.fixture
+    def delete_backup(self, underlay_actions, salt_actions):
+        """Remove Salt master backup and/or restore flag files
+
+        If present, remove existing backup(s) from the backup server.
+        If present, remove the '/srv/salt/master-restored' and
+        '/srv/salt/minion-restored' flag files, which indicate that the Salt
+        master backup restore procedure has already been executed.
+
+        Cleanup is executed before the test (to prepare a clean environment)
+        and after it (so as not to affect any later activities on the
+        environment).
+
+        :param underlay_actions: UnderlaySSHManager, tcp-qa SSH manager
+            instance
+        :param salt_actions: SaltManager, tcp-qa Salt manager instance
+        """
+        client = salt_actions.local(
+            "I@backupninja:client", "test.ping")['return'][0].keys()[0]
+        server = salt_actions.local(
+            "I@backupninja:server", "test.ping")['return'][0].keys()[0]
+
+        def cleanup(underlay_actions, server, client):
+            # Delete backups, if any, from backup server
+            path = "{}/{}".format(self.BCKP_SERVER_DIR, client)
+            underlay_actions.check_call(
+                "rm -rf {}".format(path), node_name=server, raise_on_err=False)
+
+            # Delete restore flag files from backup client, if exist
+            for f in ("minion-restored", "master-restored"):
+                underlay_actions.check_call(
+                    "rm /srv/salt/{}".format(f),
+                    node_name=client,
+                    raise_on_err=False)
+
+        cleanup(underlay_actions, server, client)
+        yield
+        cleanup(underlay_actions, server, client)
+
+    def check_salt_master_backup(self, ssh, server, path, client):
+        """Check that data directories exist in backup on backup server
+
+        :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
+        :param server: string, backup server node where backup is stored
+        :param path: string, path to the backupninja backup storage on the
+            backup server
+        :param client: string, backup client node name, which is used as the
+            name of the backup directory on the backup server
+        """
+        for subdir in ("etc", "srv", "var"):
+            cmd = "test -d {}/{}/{}".format(path, client, subdir)
+            result = ssh.check_call(
+                cmd, node_name=server, raise_on_err=False)['exit_code']
+            assert result == ExitCodes.EX_OK, (
+                "'{}' data from Salt master is not in backup.".format(subdir))
+
+    def delete_reclass_files(self, ssh, client):
+        """Delete several reclass files
+
+        :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
+        :param client: string, backup client node where files are deleted
+        """
+        files_to_delete = " ".join(self.FILES_TO_DELETE)
+        ssh.check_call(
+            "cd {}; rm {}".format(self.RECLASS_DIR, files_to_delete),
+            node_name=client,
+            raise_on_err=False)
+
+    def update_reclass_files(self, ssh, client):
+        """Update several reclass files
+
+        :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
+        :param client: string, backup client node where files are updated
+        :return: dict, key-value pairs of files and their hashes before update
+        """
+        hashes = {}
+        for f in self.FILES_TO_UPDATE:
+            path = "{}/{}".format(self.RECLASS_DIR, f)
+            # Calculate hash of a file
+            hashes[f] = ssh.check_call(
+                "sha1sum {} | awk '{{print $1}}'".format(path),
+                node_name=client,
+                raise_on_err=False)['stdout']
+            # Update a file with a dummy string
+            ssh.check_call(
+                "echo '{}' >> {}".format("#" * 200, path),
+                node_name=client,
+                raise_on_err=False)
+        return hashes
+
+    def update_backup_schedule(self, reclass):
+        """Update backup schedule on backupninja client
+
+        :param reclass: ReclassManager, tcp-qa Reclass-tools manager
+        """
+        path = "cluster/*/infra/config/init.yml"
+        reclass.add_bool_key("parameters.backupninja.enabled", "True", path)
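+        # The nested quoting is meant to keep '*/10' (every 10 minutes) as a
+        # literal string when reclass-tools writes it into the YAML model.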
+        reclass.add_key(
+            "parameters.backupninja.client.backup_times.minute",
+            "\"'*/10'\"",
+            path)
+
+    def verify_restored_data(self, ssh, client, hashes):
+        """Verify restore of deleted and updated reclass files
+
+        :param ssh: UnderlaySSHManager, tcp-qa SSH manager instance
+        :param client: string, backup client node where files are updated
+        :param hashes: dict, key-value pairs of files and their hashes
+            before update
+        """
+        # Verify that deleted files are restored
+        for f in self.FILES_TO_DELETE:
+            path = "{}/{}".format(self.RECLASS_DIR, f)
+            result = ssh.check_call(
+                "test -f {}".format(path),
+                node_name=client,
+                raise_on_err=False)['exit_code']
+            assert result == ExitCodes.EX_OK, (
+                "'{}' data is not in restored on Salt master.".format(path))
+        # Verify that changed files are restored
+        for f in self.FILES_TO_UPDATE:
+            path = "{}/{}".format(self.RECLASS_DIR, f)
+            f_hash = ssh.check_call(
+                "sha1sum {} | awk '{{print $1}}'".format(path),
+                node_name=client,
+                raise_on_err=False)['stdout']
+            assert hashes[f] == f_hash, (
+                "'{}' data is not in restored on Salt master.".format(path))
+
+    @pytest.mark.grab_versions
+    @pytest.mark.salt_master_manual_backup_restore
+    def test_salt_master_manual_backup_restore(
+            self, underlay_actions, salt_actions, show_step, delete_backup):
+        """Test manual backup restore of Salt master data
+
+        Scenario:
+            1. Backup Salt master node
+            2. Verify that Salt master backup is created on backupninja server
+               node
+            3. Delete/change some reclass data
+            4. Restore the backup
+            5. Verify that Salt master data backup is restored
+            6. Verify that minions are responding
+
+        Duration: ~ 3 min
+        """
+        salt = salt_actions
+        ssh = underlay_actions
+
+        backup_client = salt.local(
+            "I@backupninja:client", "test.ping")['return'][0].keys()[0]
+        backup_server = salt.local(
+            "I@backupninja:server", "test.ping")['return'][0].keys()[0]
+
+        # Create a backup by rsyncing local data to the 'backupninja' server
+        show_step(1)
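+        # 'backupninja -n' runs the backup immediately instead of waiting for
+        # the schedule; '--run' limits the run to the rsync action file only.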
+        cmd = "backupninja -n --run /etc/backup.d/200.backup.rsync"
+        ssh.check_call(
+            cmd, node_name=backup_client, raise_on_err=False, timeout=60 * 4)
+
+        # Verify that backup is created and all pieces of data are rsynced
+        # to backupninja server
+        show_step(2)
+        self.check_salt_master_backup(
+            ssh, backup_server, self.BCKP_SERVER_DIR, backup_client)
+
+        # Simulate loss/change of some reclass data
+        show_step(3)
+        self.delete_reclass_files(ssh, backup_client)
+        hashes = self.update_reclass_files(ssh, backup_client)
+
+        # Restore the backup
+        show_step(4)
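+        # The salt.master.restore and salt.minion.restore states pull the
+        # backed-up data back from the backupninja server and create the
+        # /srv/salt/master-restored and /srv/salt/minion-restored flag files
+        # (removed beforehand by the delete_backup fixture).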
+        ssh.check_call(
+            "salt-call state.sls salt.master.restore,salt.minion.restore",
+            node_name=backup_client,
+            raise_on_err=False,
+            timeout=60 * 4)
+
+        # Verify that all pieces of lost/changed data are restored
+        show_step(5)
+        self.verify_restored_data(ssh, backup_client, hashes)
+
+        # Ping minions
+        show_step(6)
+        salt.local('*', "test.ping", timeout=30)
+
+    @pytest.mark.grab_versions
+    @pytest.mark.salt_master_manual_backup_restore_pipeline
+    def test_salt_master_manual_backup_restore_pipeline(
+            self,
+            underlay_actions,
+            salt_actions,
+            drivetrain_actions,
+            show_step,
+            delete_backup):
+        """Test manual backup restore of Salt master data using DT pipeline
+
+        Scenario:
+            1. Execute 'backupninja_backup' pipeline to backup Salt
+               master node
+            2. Verify that Salt master backup is created on backupninja server
+               node
+            3. Delete/change some reclass data
+            4. Restore the backup
+            5. Verify that Salt master data backup is restored
+            6. Verify that minions are responding
+
+        Duration: ~ 3 min
+        """
+        salt = salt_actions
+        ssh = underlay_actions
+        dt = drivetrain_actions
+
+        backup_client = salt.local(
+            "I@backupninja:client", "test.ping")['return'][0].keys()[0]
+        backup_server = salt.local(
+            "I@backupninja:server", "test.ping")['return'][0].keys()[0]
+
+        # Execute 'backupninja_backup' pipeline to create a backup
+        show_step(1)
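+        # ASK_CONFIRMATION=False lets the pipeline run unattended, without
+        # waiting for a manual confirmation step.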
+        status = dt.start_job_on_cid_jenkins(
+            job_name=self.BACKUP_JOB_NAME,
+            job_parameters=self.BACKUP_JOB_PARAMETERS,
+            start_timeout=self.JENKINS_START_TIMEOUT,
+            build_timeout=self.JENKINS_BUILD_TIMEOUT
+        )
+        assert status == 'SUCCESS', (
+            "'{}' job run status is {} after creating Salt master backup. "
+            "Please check the build and executed stages.".format(
+                self.BACKUP_JOB_NAME, status)
+        )
+
+        # Verify that backup is created and all pieces of data are rsynced
+        # to backupninja server
+        show_step(2)
+        self.check_salt_master_backup(
+            ssh, backup_server, self.BCKP_SERVER_DIR, backup_client)
+
+        # Simulate loss/change of some reclass data
+        show_step(3)
+        self.delete_reclass_files(ssh, backup_client)
+        hashes = self.update_reclass_files(ssh, backup_client)
+
+        # Restore the backup
+        show_step(4)
+        status = dt.start_job_on_cid_jenkins(
+            job_name=self.RESTORE_JOB_NAME,
+            start_timeout=self.JENKINS_START_TIMEOUT,
+            build_timeout=self.JENKINS_BUILD_TIMEOUT
+        )
+        assert status == 'SUCCESS', (
+            "'{}' job run status is {} after restoring from Salt master "
+            "backup. Please check the build and executed stages.".format(
+                self.RESTORE_JOB_NAME, status)
+        )
+
+        # Verify that all pieces of lost/changed data are restored
+        show_step(5)
+        self.verify_restored_data(ssh, backup_client, hashes)
+
+        # Ping minions
+        show_step(6)
+        salt.local('*', "test.ping", timeout=30)
+
+    @pytest.mark.grab_versions
+    @pytest.mark.salt_master_scheduled_backup_restore
+    def test_salt_master_scheduled_backup_restore(
+            self,
+            underlay_actions,
+            salt_actions,
+            reclass_actions,
+            show_step,
+            delete_backup):
+        """Test scheduled backup restore of Salt master data
+
+        Scenario:
+            1. Update Salt master backup schedule to run every 10 minutes
+            2. Apply 'backupninja' state on the backupninja client node
+            3. Wait until backup creation is triggered by schedule
+            4. Wait until backup creation is finished
+            5. Verify that Salt master backup is created on backupninja server
+               node
+            6. Delete/change some reclass data
+            7. Restore the backup
+            8. Verify that Salt master data backup is restored
+            9. Verify that minions are responding
+
+        Duration: ~ 3 min
+        """
+        salt = salt_actions
+        ssh = underlay_actions
+        reclass = reclass_actions
+
+        backup_client = salt.local(
+            "I@backupninja:client", "test.ping")['return'][0].keys()[0]
+        backup_server = salt.local(
+            "I@backupninja:server", "test.ping")['return'][0].keys()[0]
+
+        # Re-configure backup schedule
+        show_step(1)
+        self.update_backup_schedule(reclass)
+
+        # Apply 'backupninja' state on backupninja client node
+        show_step(2)
+        salt.enforce_state("I@backupninja:client", "backupninja")
+
+        # Wait until backup is triggered by schedule
+        show_step(3)
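+        # wait_pass retries the check until 'pgrep backupninja' succeeds,
+        # i.e. until the scheduled run starts; the 11-minute timeout covers
+        # the 10-minute cron interval set in update_backup_schedule().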
+        helpers.wait_pass(
+            lambda: ssh.check_call(
+                cmd="pgrep backupninja && echo OK", node_name=backup_client),
+            timeout=60 * 11,
+            interval=5)
+
+        # Wait until backup is finished
+        show_step(4)
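+        # Poll until no backupninja process is left, i.e. the scheduled
+        # backup run has finished.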
+        ssh.check_call(
+            cmd="while pgrep backupninja > /dev/null; do sleep 2; done",
+            node_name=backup_client,
+            timeout=60 * 5)
+
+        # Verify that backup is created and all pieces of data are rsynced
+        # to backupninja server
+        show_step(5)
+        self.check_salt_master_backup(
+            ssh, backup_server, self.BCKP_SERVER_DIR, backup_client)
+
+        # Simulate loss/change of some reclass data
+        show_step(6)
+        self.delete_reclass_files(ssh, backup_client)
+        hashes = self.update_reclass_files(ssh, backup_client)
+
+        # Restore the backup
+        show_step(7)
+        ssh.check_call(
+            "salt-call state.sls salt.master.restore,salt.minion.restore",
+            node_name=backup_client,
+            raise_on_err=False,
+            timeout=60 * 4)
+
+        # Verify that all pieces of lost/changed data are restored
+        show_step(8)
+        self.verify_restored_data(ssh, backup_client, hashes)
+
+        # Ping minions
+        show_step(9)
+        salt.local('*', "test.ping", timeout=30)
+
     @pytest.mark.grab_versions
     @pytest.mark.fail_snapshot
     @pytest.mark.backup_all
-    def test_backup_cfg_backupninja_rsync(
+    def _test_backup_cfg_backupninja_rsync(
             self, underlay, config, openstack_deployed,
             salt_actions, show_step):
         """Test backup restore master node