Automated package versions update for tags

Module repos.py:
 - ReposInfo() walks the mirror over HTTP and builds
   a map of all available repos
 - RepoManager() uses the repos map to build a package versions
   map, either for a specific tag or for all of them

Fixes:
 - Progress class now clears the line remainder when the note changes

Utils:
 - Download a GZ file into memory
 - TGZ file interface with create/read/update support (no deletion)

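Usage sketch (the executable name and tag below are illustrative;
substitute the project's actual console script and a real tag):

  cfg-checker packages versions --list-tags
  cfg-checker packages versions --tag <tag> --build-repos
  cfg-checker packages versions --tag <tag> --gen-desc
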
Change-Id: Ifdb37aa4b68fb25f642b2089cf16cd242ed25a0b
Related-PROD: PROD-28199
diff --git a/cfg_checker/modules/packages/__init__.py b/cfg_checker/modules/packages/__init__.py
index a4a81bd..f88ce51 100644
--- a/cfg_checker/modules/packages/__init__.py
+++ b/cfg_checker/modules/packages/__init__.py
@@ -1,4 +1,5 @@
 from cfg_checker.helpers import args_utils
+from cfg_checker.modules.packages.repos import RepoManager
 
 import checker
 
@@ -29,6 +30,36 @@
         help="CSV filename to save report"
     )
 
+    pkg_repos = pkg_subparsers.add_parser(
+        'versions',
+        help="Parse versions at given URL and create local map"
+    )
+    pkg_repos.add_argument(
+        '--list-tags',
+        action="store_true", default=False,
+        help="Just list tags available in mirror"
+    )
+    pkg_repos.add_argument(
+        '--url',
+        metavar='repo_url', default="http://mirror.mirantis.com",
+        help="URL for repos, default: http://mirror.mirantis.com"
+    )
+    pkg_repos.add_argument(
+        '--tag',
+        metavar='repo_tag', default=None,
+        help="Repository tag to process packages from. Default: "
+        "All url's root folder tags"
+    )
+    pkg_repos.add_argument(
+        '--build-repos',
+        action="store_true", default=False,
+        help="Conduct build stage before working with tags"
+    )
+    pkg_repos.add_argument(
+        '--gen-desc',
+        action="store_true", default=False,
+        help="Save pkg descriptions while parsing"
+    )
     return _parser
 
 
@@ -48,3 +79,32 @@
     pChecker.collect_packages()
     # report it
     pChecker.create_report(_filename, rtype=_type, full=args.full)
+
+
+def do_versions(args):
+    """Builds tagged repo structure and parses Packages.gz files
+
+    :param args: parser arguments
+    :return: no return value
+    """
+    r = RepoManager()
+    # if only a tags listing was requested, print it and exit
+    if args.list_tags:
+        r.action_for_tag(args.url, args.tag, action="list")
+        return
+    if args.build_repos:
+        # if tag is supplied, use it
+        if args.tag:
+            r.action_for_tag(args.url, args.tag, action="build")
+        else:
+            r.build_repos(args.url)
+
+    # if tag is supplied, use it
+    if args.tag:
+        # Process only this tag
+        r.action_for_tag(
+            args.url,
+            args.tag,
+            action="fetch",
+            descriptions=args.gen_desc
+        )
+    else:
+        # All of them
+        r.parse_repos()
diff --git a/cfg_checker/modules/packages/checker.py b/cfg_checker/modules/packages/checker.py
index 0bcb1a6..514bd9c 100644
--- a/cfg_checker/modules/packages/checker.py
+++ b/cfg_checker/modules/packages/checker.py
@@ -93,7 +93,7 @@
                 _eo += _val['results'].keys().count(const.VERSION_ERR)
                 _do += _val['results'].keys().count(const.VERSION_DOWN)
 
-        _progress.newline()
+        _progress.end()
 
         _data['errors'] = {
             'mirantis': _ec,
@@ -239,7 +239,7 @@
                 }
 
         self._packages = _all_packages
-        _progress.newline()
+        _progress.end()
 
     def create_report(self, filename, rtype, full=None):
         """
diff --git a/cfg_checker/modules/packages/repos.py b/cfg_checker/modules/packages/repos.py
new file mode 100644
index 0000000..2a91ed5
--- /dev/null
+++ b/cfg_checker/modules/packages/repos.py
@@ -0,0 +1,636 @@
+import json
+import os
+from copy import deepcopy
+
+from cfg_checker.common import logger, logger_cli
+from cfg_checker.common.const import _pkg_desc_archive
+from cfg_checker.common.const import _repos_index_filename
+from cfg_checker.common.const import _repos_info_archive
+from cfg_checker.common.const import _repos_versions_archive
+from cfg_checker.common.const import _repos_versions_filename
+from cfg_checker.common.const import ubuntu_releases
+from cfg_checker.common.file_utils import get_gzipped_file
+from cfg_checker.common.settings import pkg_dir
+from cfg_checker.helpers.console_utils import Progress
+from cfg_checker.helpers.tgz import TGZFile
+
+import requests
+from requests.exceptions import ConnectionError
+
+ext = ".json"
+
+
+def _n_url(url):
+    if url[-1] == '/':
+        return url
+    else:
+        return url + '/'
+
+
+class ReposInfo(object):
+    repos = []
+    _repofile = os.path.join(pkg_dir, "versions", _repos_info_archive)
+
+    @staticmethod
+    def _ls_repo_page(url):
+        # Yes, this is ugly. But it works ok for small HTMLs.
+        _a = "<a"
+        _s = "href="
+        _e = "\">"
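+        # index(_s)+6 skips past 'href="' (the 5-char attribute name plus
+        # the opening quote); for dirs the '-1' also drops the trailing '/'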
+        try:
+            page = requests.get(url, timeout=60)
+        except ConnectionError as e:
+            logger_cli.error("# ERROR: {}".format(e))
+            return [], []
+        a = page.text.splitlines()
+        # Comprehension for dirs: index rows for dirs end with '-'
+        # (the size column)
+        _dirs = [l[l.index(_s)+6:l.index(_e)-1]
+                 for l in a if l.startswith(_a) and l.endswith('-')]
+        # Comprehension for files: index rows for files end with the size
+        _files = [l[l.index(_s)+6:l.index(_e)]
+                  for l in a if l.startswith(_a) and not l.endswith('-')]
+
+        return _dirs, _files
+
+    def search_pkg(self, url, _list):
+        # recursive method to walk the dists tree
+        _dirs, _files = self._ls_repo_page(url)
+
+        for _d in _dirs:
+            # Search only in dists, ignore the rest
+            if "dists" not in url and _d != "dists":
+                continue
+            _u = _n_url(url + _d)
+            self.search_pkg(_u, _list)
+
+        for _f in _files:
+            if _f == "Packages.gz":
+                _list.append(url + _f)
+                logger.debug("... [F] '{}'".format(url + _f))
+
+        return _list
+
+    @staticmethod
+    def _map_repo(_path_list, _r):
+        for _pkg_path in _path_list:
+            _l = _pkg_path.split('/')
+            _kw = _l[_l.index('dists')+1:]
+            _kw.reverse()
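+            # reversed path parts, e.g. .../dists/xenial/main/binary-amd64/
+            # Packages.gz gives _kw[0]=file, _kw[1]=binary-<arch>,
+            # _kw[2]=type, _kw[3]=ubuntu release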
+            _repo_item = {
+                "arch": _kw[1][7:] if "binary" in _kw[1] else _kw[1],
+                "type": _kw[2],
+                "ubuntu-release": _kw[3],
+                "filepath": _pkg_path
+            }
+            _r.append(_repo_item)
+
+    def _find_tag(self, _t, _u, label=""):
+        if label:
+            _url = _n_url(_u + label)
+            _label = _t + '.' + label
+        else:
+            _url = _u
+            _label = _t
+        _ts, _ = self._ls_repo_page(_url)
+        if _t in _ts:
+            logger.debug(
+                "... found tag '{}' at '{}'".format(
+                    _t,
+                    _url
+                )
+            )
+            return {
+                _label: {
+                    "baseurl": _n_url(_url + _t),
+                    "all": {}
+                }
+            }
+        else:
+            return {}
+
+    def fetch_repos(self, url, tag=None):
+        base_url = _n_url(url)
+        logger_cli.info("# Using '{}' as a repos source".format(base_url))
+
+        logger_cli.info("# Gathering repos info (i.e. links to 'packages.gz')")
+        # init repoinfo archive
+        _repotgz = TGZFile(self._repofile)
+        # prepare repo links
+        _repos = {}
+        if tag:
+            # only one tag to process
+            _repos.update(self._find_tag(tag, base_url))
+            _repos.update(self._find_tag(tag, base_url, label="hotfix"))
+            _repos.update(self._find_tag(tag, base_url, label="update"))
+        else:
+            # gather all of them
+            _tags, _ = self._ls_repo_page(base_url)
+            _tags.remove('hotfix')
+            _tags.remove('update')
+            # search tags in subfolders
+            _h_tags, _ = self._ls_repo_page(base_url + 'hotfix')
+            _u_tags, _ = self._ls_repo_page(base_url + 'update')
+            _tags.extend([t for t in _h_tags if t not in _tags])
+            _tags.extend([t for t in _u_tags if t not in _tags])
+            _progress = Progress(len(_tags))
+            _index = 0
+            for _tag in _tags:
+                _repos.update(self._find_tag(_tag, base_url))
+                _repos.update(self._find_tag(_tag, base_url, label="hotfix"))
+                _repos.update(self._find_tag(_tag, base_url, label="update"))
+                _index += 1
+                _progress.write_progress(_index)
+            _progress.end()
+
+        # parse subtags
+        for _label in _repos.keys():
+            logger_cli.info("-> processing tag '{}'".format(_label))
+            _name = _label + ".json"
+            if _repotgz.has_file(_name):
+                logger_cli.info(
+                    "-> skipping, '{}' already has '{}'".format(
+                        _repos_info_archive,
+                        _name
+                    )
+                )
+                continue
+            # process the tag
+            _repo = _repos[_label]
+            _baseurl = _repos[_label]["baseurl"]
+            # get the subtags
+            _sub_tags, _ = self._ls_repo_page(_baseurl)
+            _total_index = len(_sub_tags)
+            _index = 0
+            _progress = Progress(_total_index)
+            logger.debug(
+                "... found {} subtags for '{}'".format(
+                    len(_sub_tags),
+                    _label
+                )
+            )
+            # save the url and start search
+            for _stag in _sub_tags:
+                _u = _baseurl + _stag
+                _index += 1
+                logger.debug(
+                    "... searching repos in '{}/{}'".format(
+                        _label,
+                        _stag
+                    )
+                )
+
+                # Searching Package collections
+                if _stag in ubuntu_releases:
+                    # if stag is the release, this is all packages
+                    _repo["all"][_stag] = []
+                    _repo["all"]["url"] = _n_url(_u)
+                    _path_list = self.search_pkg(_n_url(_u), [])
+                    self._map_repo(_path_list, _repo["all"][_stag])
+                    logger.info(
+                        "-> found {} dists".format(
+                            len(_repo["all"][_stag])
+                        )
+                    )
+
+                else:
+                    # each subtag might have any ubuntu release
+                    # so iterate them
+                    _repo[_stag] = {
+                        "url": _n_url(_u)
+                    }
+                    _releases, _ = self._ls_repo_page(_n_url(_u))
+                    for _rel in _releases:
+                        if _rel not in ubuntu_releases:
+                            logger.debug(
+                                "... skipped unknown ubuntu release: "
+                                "'{}' in '{}'".format(
+                                    _rel,
+                                    _u
+                                )
+                            )
+                        else:
+                            _rel_u = _n_url(_u) + _rel
+                            _repo[_stag][_rel] = []
+                            _path_list = self.search_pkg(_n_url(_rel_u), [])
+                            self._map_repo(
+                                _path_list,
+                                _repo[_stag][_rel]
+                            )
+                            logger.info(
+                                "-> found {} dists for '{}'".format(
+                                    len(_repo[_stag][_rel]),
+                                    _rel
+                                )
+                            )
+                _progress.write_progress(_index)
+
+            _progress.end()
+            _name = _label + ext
+            _repotgz.add_file(_name, buf=json.dumps(_repo, indent=2))
+            logger_cli.info(
+                "-> archive '{}' updated with '{}'".format(
+                    self._repofile,
+                    _name
+                )
+            )
+
+        return
+
+    def list_tags(self):
+        _files = TGZFile(self._repofile).list_files()
+        # all files in archive with no '.json' part
+        _all = set([f.rsplit('.', 1)[0] for f in _files])
+        # files that end with '.update'
+        _updates = set([f for f in _all if f.find('update') >= 0])
+        # files that end with '.hotfix'
+        _hotfix = set([f for f in _all if f.find('hotfix') >= 0])
+        # remove updates and hotfix tags from all. The true magic of SETs
+        _all = _all - _updates - _hotfix
+        # cut updates and hotfix endings
+        _updates = [f.rsplit('.', 1)[0] for f in _updates]
+        _hotfix = [f.rsplit('.', 1)[0] for f in _hotfix]
+
+        return _all, _updates, _hotfix
+
+    def get_repoinfo(self, tag):
+        _tgz = TGZFile(self._repofile)
+        _buf = _tgz.get_file(tag + ext)
+        return json.loads(_buf)
+
+
+class RepoManager(object):
+    # files in archive
+    _repoindexfile = _repos_index_filename
+    _versionsfile = _repos_versions_filename
+    # archives
+    _versions_arch = os.path.join(pkg_dir, "versions", _repos_versions_archive)
+    _desc_arch = os.path.join(pkg_dir, "versions", _pkg_desc_archive)
+
+    # repository index
+    _repo_index = {}
+
+    # init package versions storage
+    _versions = {}
+
+    def __init__(self):
+        # Init version files
+        self.versionstgz = TGZFile(
+            self._versions_arch,
+            label="MCP Configuration Checker: Package versions archive"
+        )
+        self.desctgz = TGZFile(
+            self._desc_arch,
+            label="MCP Configuration Checker: Package descriptions archive"
+        )
+
+        if self._versionsfile in self.versionstgz.list_files():
+            logger_cli.info(
+                "# Loading versions '{}':'{}'".format(
+                    self._versions_arch,
+                    self._versionsfile
+                )
+            )
+            self._versions = json.loads(
+                self.versionstgz.get_file(self._versionsfile)
+            )
+
+        if self._repoindexfile in self.versionstgz.list_files():
+            self._repo_index = json.loads(
+                self.versionstgz.get_file(
+                    self._repoindexfile
+                )
+            )
+
+    def _create_repo_header(self, p):
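+        # the 'header' string uniquely identifies a repo by its properties;
+        # each unique header is stored once in the index and is referenced
+        # from the versions map by its number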
+        _header = "_".join([
+            p['tag'],
+            p['subset'],
+            p['release'],
+            p['ubuntu-release'],
+            p['type'],
+            p['arch']
+        ])
+        if not filter(
+            lambda i: self._repo_index[i]["header"] == _header,
+            self._repo_index
+        ):
+            _index = str(len(self._repo_index.keys()) + 1)
+            self._repo_index[_index] = {
+                "header": _header,
+                "props": p
+            }
+        else:
+            for _k, _v in self._repo_index.iteritems():
+                if _v["header"] == _header:
+                    _index = _k
+
+        return _index
+
+    def _get_repo_header(self, index):
+        return self._repo_index[index]
+
+    def _update_pkg_version(self, n, v, md5, header_index):
+        """Method updates package version record in global dict
+        """
+        # a four-level 'if' chain is pretty expensive when run 100k times
+        # in a row, so try/except is a better way to go, even faster
+        # than 'reduce'
+        vs = self._versions
+        try:
+            # try to load list
+            _list = vs[n][v][md5]
+            # cast it as set() and union()
+            _list = set(_list).union([header_index])
+            # cast back to list, since set() is not JSON-serializable
+            vs[n][v][md5] = list(_list)
+            return False
+        except KeyError:
+            # ok, this is fresh pkg. Do it slow way.
+            if n in vs:
+                # there is such pkg already
+                if v in vs[n]:
+                    # there is such version, check md5
+                    if md5 in vs[n][v]:
+                        # just add new repo header
+                        if header_index not in vs[n][v][md5]:
+                            vs[n][v][md5].append(header_index)
+                    else:
+                        # check if such index is here...
+                        _existing = filter(
+                            lambda i: header_index in vs[n][v][i],
+                            vs[n][v]
+                        )
+                        if _existing:
+                            # Yuck! Same version had different MD5
+                            logger_cli.error(
+                                "# ERROR: Package version has multiple MD5s "
+                                "in '{}': {}:{}:{}".format(
+                                    self._get_repo_header(
+                                        header_index
+                                    )["header"],
+                                    n,
+                                    v,
+                                    md5
+                                )
+                            )
+                        vs[n][v][md5] = [header_index]
+                else:
+                    # this is new version for existing package
+                    vs[n][v] = {
+                        md5: [header_index]
+                    }
+                return False
+            else:
+                # this is a new package
+                vs[n] = {
+                    v: {
+                        md5: [header_index]
+                    }
+                }
+                return True
+
+    def _save_repo_descriptions(self, repo_props, desc):
+        # form the filename for the repo and save it
+        self.desctgz.add_file(
+            self._create_repo_header(repo_props),
+            json.dumps(desc)
+        )
+
+    # def get_description(self, repo_props, name, md5=None):
+    #     """Gets target description
+    #     """
+    #     _filename = self._create_repo_header(repo_props)
+    #     # check if it is present in cache
+    #     if _filename in self._desc_cache:
+    #         _descs = self._desc_cache[_filename]
+    #     else:
+    #         # load data
+    #         _descs = self.desctgz.get_file(_filename)
+    #         # Serialize it
+    #         _descs = json.loads(_descs)
+    #         self._desc_cache[_filename] = _descs
+    #     # return target desc
+    #     if name in _descs and md5 in _descs[name]:
+    #         return _descs[name][md5]
+    #     else:
+    #         return None
+
+    def parse_tag(self, tag, descriptions=False):
+        """Download and parse Package.gz files for specific tag
+        By default, descriptions not saved
+        due to huge resulting file size and slow processing
+        """
+        # init gzip and downloader
+        _info = ReposInfo().get_repoinfo(tag)
+        # calculate Packages.gz files to process
+        _baseurl = _info.pop("baseurl")
+        _total_components = len(_info.keys()) - 1
+        _ubuntu_package_repos = 0
+        _other_repos = 0
+        for _c, _d in _info.iteritems():
+            for _ur, _l in _d.iteritems():
+                if _ur in ubuntu_releases:
+                    _ubuntu_package_repos += len(_l)
+                elif _ur != 'url':
+                    _other_repos += len(_l)
+        logger_cli.info(
+            "-> loaded repository info for '{}'.\n"
+            "  '{}', {} components, {} ubuntu repos, {} other/uknown".format(
+                _baseurl,
+                tag,
+                _total_components,
+                _ubuntu_package_repos,
+                _other_repos
+            )
+        )
+        # init progress bar
+        _progress = Progress(_ubuntu_package_repos)
+        _index = 0
+        _processed = 0
+        _new = 0
+        for _c, _d in _info.iteritems():
+            # we do not need url here, just get rid of it
+            if 'url' in _d:
+                _d.pop('url')
+            for _ur, _l in _d.iteritems():
+                # iterate package collections
+                for _p in _l:
+                    # descriptions
+                    if descriptions:
+                        _descriptions = {}
+                    # download and unzip
+                    _progress.write_progress(
+                        _index,
+                        note="/ {} {} {} {} {}, {}/{}".format(
+                            _c,
+                            _ur,
+                            _p['ubuntu-release'],
+                            _p['type'],
+                            _p['arch'],
+                            _processed,
+                            _new
+                        )
+                    )
+                    _raw = get_gzipped_file(_p['filepath'])
+                    _lines = _raw.splitlines()
+                    _index += 1
+                    # break lines collection into isolated pkg data
+                    _pkg = {
+                        "tag": tag,
+                        "subset": _c,
+                        "release": _ur
+                    }
+                    _pkg.update(_p)
+                    _desc = {}
+                    _key = _value = ""
+                    for _line in _lines:
+                        if not _line:
+                            # if the line is empty, process pkg data gathered
+                            _name = _desc['package']
+                            _md5 = _desc['md5sum']
+                            _version = _desc['version']
+                            _pkg['md5'] = _md5
+                            # update version for a package
+                            if self._update_pkg_version(
+                                _name,
+                                _version,
+                                _md5,
+                                self._create_repo_header(_pkg)
+                            ):
+                                _new += 1
+
+                            if descriptions:
+                                _d_new = {
+                                    _md5: deepcopy(_desc)
+                                }
+                                try:
+                                    _descriptions[_name].update(_d_new)
+                                except KeyError:
+                                    _descriptions[_name] = _d_new
+                            # clear the data for next pkg
+                            _processed += 1
+                            _desc = {}
+                            _key = ""
+                            _value = ""
+                        elif _line.startswith(' '):
+                            _desc[_key] += "\n{}".format(_line)
+                        else:
+                            _key, _value = _line.split(': ', 1)
+                            _key = _key.lower()
+
+                            _desc[_key] = _value
+                    # save descriptions if needed
+                    if descriptions:
+                        _progress.clearline()
+                        self._save_repo_descriptions(_pkg, _descriptions)
+
+        _progress.end()
+        # backup headers to disk
+        self.versionstgz.add_file(
+            self._repoindexfile,
+            json.dumps(self._repo_index),
+            replace=True
+        )
+        return
+
+    def fetch_versions(self, tag, descriptions=False):
+        """Executes parsing for specific tag
+        """
+        if descriptions:
+            logger_cli.warning(
+                "\n\n# !!! WARNING: Saving repo descriptions "
+                "consumes huge amount of disk space\n\n"
+            )
+        # if there is no such tag, parse it from repoinfo
+        _f = self._versionsfile
+        logger_cli.info("# Fetching versions for {}".format(tag))
+        self.parse_tag(tag, descriptions=descriptions)
+        logger_cli.info("-> saving updated versions to {}".format(_f))
+        self.versionstgz.add_file(_f, json.dumps(self._versions), replace=True)
+
+    def build_repos(self, url, tag=None):
+        """Builds versions data for selected tag, or for all of them
+        """
+        # Init the ReposInfo class and check if all files are present
+        _repos = ReposInfo()
+        # recursively walk the mirrors
+        # and gather all of the repos for 'tag' or all of the tags
+        _repos.fetch_repos(url, tag=tag)
+
+    def action_for_tag(
+        self,
+        url,
+        tag,
+        action=None,
+        descriptions=None
+    ):
+        """Executes action for every tag from all collections
+        """
+        if not action:
+            logger_cli.info("# No action set, nothing to do")
+            return
+        # get all tags
+        major, updates, hotfix = ReposInfo().list_tags()
+        if action == "list":
+            logger_cli.info("# Tags available at '{}':".format(url))
+            for t in major:
+                logger_cli.info("\t{}".format(t))
+            for t in updates:
+                logger_cli.info("\t{} [updates]".format(t))
+            for t in hotfix:
+                logger_cli.info("\t{} [hotfix]".format(t))
+            return
+        # Populate action tags
+        _action_tags = []
+        if tag in major:
+            _action_tags.append(tag)
+        elif tag in updates:
+            _action_tags.append(tag + ".update")
+        elif tag in hotfix:
+            _action_tags.append(tag + ".hotfix")
+
+        if not _action_tags:
+            logger_cli.info(
+                "# Tag of '{}' not found. "
+                "Consider rebuilding repos info.".format(tag)
+            )
+        elif action == "build":
+            logger_cli.info(
+                "-> tags to build {}".format(", ".join(_action_tags))
+            )
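+            # note: build_repos expects the bare tag name, so the original
+            # 'tag' is passed below rather than the suffixed entries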
+            for t in _action_tags:
+                logger_cli.info(
+                    "# Building repo info for '{}/{}'".format(
+                        url,
+                        tag
+                    )
+                )
+                self.build_repos(url, tag=tag)
+        elif action == "fetch":
+            logger_cli.info(
+                "-> fetching versions for tags {}".format(
+                    ", ".join(_action_tags)
+                )
+            )
+            for t in _action_tags:
+                self.fetch_versions(t, descriptions=descriptions)
+
+        logger_cli.info("# Done.")
+
+    def parse_repos(self):
+        # all tags to check
+        major, updates, hotfix = ReposInfo().list_tags()
+
+        # major tags
+        logger_cli.info("# Processing major tags")
+        for _tag in major:
+            self.fetch_versions(_tag)
+
+        # updates tags
+        logger_cli.info("# Processing update tags")
+        for _tag in updates:
+            self.fetch_versions(_tag + ".update")
+
+        # hotfix tags
+        logger_cli.info("# Processing hotfix tags")
+        for _tag in hotfix:
+            self.fetch_versions(_tag + ".hotfix")
diff --git a/cfg_checker/modules/packages/versions.py b/cfg_checker/modules/packages/versions.py
index 9737d80..a2bd083 100644
--- a/cfg_checker/modules/packages/versions.py
+++ b/cfg_checker/modules/packages/versions.py
@@ -19,7 +19,11 @@
     def __init__(self):
         # preload csv file
         logger_cli.info("# Preloading MCP release versions")
-        with open(os.path.join(pkg_dir, 'etc', config.pkg_versions_map)) as f:
+        with open(os.path.join(
+            pkg_dir,
+            'versions',
+            config.pkg_versions_map)
+        ) as f:
             _reader = csv.reader(f, delimiter=',')
             # load packages
             for row in _reader: