Automated Package versions update for tags
Module repos.py:
- ReposInfo() walks the mirror over HTTP and builds
  a map of all available repos
- RepoManager() uses the repos map to build a package versions map,
  either for specific tags or for all of them
Fixes:
- Progress class clears the line remainder on change
Utils:
- Download a GZ file into memory
- TGZ file interface, CRU (no deletion)
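Usage sketch (illustrative only; "2019.2.0" is an example tag name,
any tag present at the mirror root works the same way):

    from cfg_checker.modules.packages.repos import RepoManager

    r = RepoManager()
    # build repos info for one tag, then fetch its package versions
    r.action_for_tag("http://mirror.mirantis.com", "2019.2.0",
                     action="build")
    r.action_for_tag("http://mirror.mirantis.com", "2019.2.0",
                     action="fetch", descriptions=False)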
Change-Id: Ifdb37aa4b68fb25f642b2089cf16cd242ed25a0b
Related-PROD: PROD-28199
diff --git a/cfg_checker/modules/packages/__init__.py b/cfg_checker/modules/packages/__init__.py
index a4a81bd..f88ce51 100644
--- a/cfg_checker/modules/packages/__init__.py
+++ b/cfg_checker/modules/packages/__init__.py
@@ -1,4 +1,5 @@
from cfg_checker.helpers import args_utils
+from cfg_checker.modules.packages.repos import RepoManager
import checker
@@ -29,6 +30,36 @@
help="CSV filename to save report"
)
+ pkg_repos = pkg_subparsers.add_parser(
+ 'versions',
+ help="Parse versions at given URL and create local map"
+ )
+ pkg_repos.add_argument(
+ '--list-tags',
+ action="store_true", default=False,
+ help="Just list tags available in mirror"
+ )
+ pkg_repos.add_argument(
+ '--url',
+ metavar='repo_url', default="http://mirror.mirantis.com",
+ help="URL for repos, default: http://mirror.mirantis.com"
+ )
+ pkg_repos.add_argument(
+ '--tag',
+ metavar='repo_tag', default=None,
+ help="Repository tag to process packages from. Default: "
+ "All url's root folder tags"
+ )
+ pkg_repos.add_argument(
+ '--build-repos',
+ action="store_true", default=False,
+ help="Conduct build stage before working with tags"
+ )
+ pkg_repos.add_argument(
+ '--gen-desc',
+ action="store_true", default=False,
+ help="Save pkg descriptions while parsing"
+ )
return _parser
@@ -48,3 +79,32 @@
pChecker.collect_packages()
# report it
pChecker.create_report(_filename, rtype=_type, full=args.full)
+
+
+def do_versions(args):
+ """Builds tagged repo structure and parses Packages.gz files
+
+ :param args: parser arguments
+ :return: None
+ """
+ # Get the list of tags for the url
+ r = RepoManager()
+ if args.build_repos:
+ # if tag is supplied, use it
+ if args.tag:
+ r.action_for_tag(args.url, args.tag, action="build")
+ else:
+ r.build_repos(args.url)
+
+ # if tag is supplied, use it
+ if args.tag:
+ # Process only this tag
+ r.action_for_tag(
+ args.url,
+ args.tag,
+ action="fetch",
+ descriptions=args.gen_desc
+ )
+ else:
+ # All of them
+ r.parse_repos()
diff --git a/cfg_checker/modules/packages/checker.py b/cfg_checker/modules/packages/checker.py
index 0bcb1a6..514bd9c 100644
--- a/cfg_checker/modules/packages/checker.py
+++ b/cfg_checker/modules/packages/checker.py
@@ -93,7 +93,7 @@
_eo += _val['results'].keys().count(const.VERSION_ERR)
_do += _val['results'].keys().count(const.VERSION_DOWN)
- _progress.newline()
+ _progress.end()
_data['errors'] = {
'mirantis': _ec,
@@ -239,7 +239,7 @@
}
self._packages = _all_packages
- _progress.newline()
+ _progress.end()
def create_report(self, filename, rtype, full=None):
"""
diff --git a/cfg_checker/modules/packages/repos.py b/cfg_checker/modules/packages/repos.py
new file mode 100644
index 0000000..2a91ed5
--- /dev/null
+++ b/cfg_checker/modules/packages/repos.py
@@ -0,0 +1,636 @@
+import json
+import os
+from copy import deepcopy
+
+from cfg_checker.common import logger, logger_cli
+from cfg_checker.common.const import _pkg_desc_archive
+from cfg_checker.common.const import _repos_index_filename
+from cfg_checker.common.const import _repos_info_archive
+from cfg_checker.common.const import _repos_versions_archive
+from cfg_checker.common.const import _repos_versions_filename
+from cfg_checker.common.const import ubuntu_releases
+from cfg_checker.common.file_utils import get_gzipped_file
+from cfg_checker.common.settings import pkg_dir
+from cfg_checker.helpers.console_utils import Progress
+from cfg_checker.helpers.tgz import TGZFile
+
+import requests
+from requests.exceptions import ConnectionError
+
+ext = ".json"
+
+
+def _n_url(url):
+ if url[-1] == '/':
+ return url
+ else:
+ return url + '/'
+
+
+class ReposInfo(object):
+ repos = []
+ _repofile = os.path.join(pkg_dir, "versions", _repos_info_archive)
+
+ @staticmethod
+ def _ls_repo_page(url):
+ # Yes, this is ugly. But it works ok for small HTMLs.
+ _a = "<a"
+ _s = "href="
+ _e = "\">"
+ try:
+ page = requests.get(url, timeout=60)
+ except ConnectionError as e:
+ logger_cli.error("# ERROR: {}".format(e.message))
+ return [], []
+ a = page.text.splitlines()
+ # Comprehension for dirs: anchor lines ending with '-'
+ _dirs = [l[l.index(_s)+6:l.index(_e)-1]
+ for l in a if l.startswith(_a) and l.endswith('-')]
+ # Comprehension for files: anchor lines ending with the size
+ _files = [l[l.index(_s)+6:l.index(_e)]
+ for l in a if l.startswith(_a) and not l.endswith('-')]
+
+ return _dirs, _files
+
+ def search_pkg(self, url, _list):
+ # recursive method to walk the dists tree
+ _dirs, _files = self._ls_repo_page(url)
+
+ for _d in _dirs:
+ # Search only in dists, ignore the rest
+ if "dists" not in url and _d != "dists":
+ continue
+ _u = _n_url(url + _d)
+ self.search_pkg(_u, _list)
+
+ for _f in _files:
+ if _f == "Packages.gz":
+ _list.append(url + _f)
+ logger.debug("... [F] '{}'".format(url + _f))
+
+ return _list
+
+ @staticmethod
+ def _map_repo(_path_list, _r):
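+ # expected path layout (as produced by search_pkg):
+ # .../dists/<ubuntu-release>/<type>/binary-<arch>/Packages.gz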
+ for _pkg_path in _path_list:
+ _l = _pkg_path.split('/')
+ _kw = _l[_l.index('dists')+1:]
+ _kw.reverse()
+ _repo_item = {
+ "arch": _kw[1][7:] if "binary" in _kw[1] else _kw[1],
+ "type": _kw[2],
+ "ubuntu-release": _kw[3],
+ "filepath": _pkg_path
+ }
+ _r.append(_repo_item)
+
+ def _find_tag(self, _t, _u, label=""):
+ if label:
+ _url = _n_url(_u + label)
+ _label = _t + '.' + label
+ else:
+ _url = _u
+ _label = _t
+ _ts, _ = self._ls_repo_page(_url)
+ if _t in _ts:
+ logger.debug(
+ "... found tag '{}' at '{}'".format(
+ _t,
+ _url
+ )
+ )
+ return {
+ _label: {
+ "baseurl": _n_url(_url + _t),
+ "all": {}
+ }
+ }
+ else:
+ return {}
+
+ def fetch_repos(self, url, tag=None):
+ base_url = _n_url(url)
+ logger_cli.info("# Using '{}' as a repos source".format(base_url))
+
+ logger_cli.info("# Gathering repos info (i.e. links to 'packages.gz')")
+ # init repoinfo archive
+ _repotgz = TGZFile(self._repofile)
+ # prepare repo links
+ _repos = {}
+ if tag:
+ # only one tag to process
+ _repos.update(self._find_tag(tag, base_url))
+ _repos.update(self._find_tag(tag, base_url, label="hotfix"))
+ _repos.update(self._find_tag(tag, base_url, label="update"))
+ else:
+ # gather all of them
+ _tags, _ = self._ls_repo_page(base_url)
+ _tags.remove('hotfix')
+ _tags.remove('update')
+ # search tags in subfolders
+ _h_tags, _ = self._ls_repo_page(base_url + 'hotfix')
+ _u_tags, _ = self._ls_repo_page(base_url + 'update')
+ _tags.extend([t for t in _h_tags if t not in _tags])
+ _tags.extend([t for t in _u_tags if t not in _tags])
+ _progress = Progress(len(_tags))
+ _index = 0
+ for _tag in _tags:
+ _repos.update(self._find_tag(_tag, base_url))
+ _repos.update(self._find_tag(_tag, base_url, label="hotfix"))
+ _repos.update(self._find_tag(_tag, base_url, label="update"))
+ _index += 1
+ _progress.write_progress(_index)
+ _progress.end()
+
+ # parse subtags
+ for _label in _repos.keys():
+ logger_cli.info("-> processing tag '{}'".format(_label))
+ _name = _label + ".json"
+ if _repotgz.has_file(_name):
+ logger_cli.info(
+ "-> skipping, '{}' already has '{}'".format(
+ _repos_info_archive,
+ _name
+ )
+ )
+ continue
+ # process the tag
+ _repo = _repos[_label]
+ _baseurl = _repos[_label]["baseurl"]
+ # get the subtags
+ _sub_tags, _ = self._ls_repo_page(_baseurl)
+ _total_index = len(_sub_tags)
+ _index = 0
+ _progress = Progress(_total_index)
+ logger.debug(
+ "... found {} subtags for '{}'".format(
+ len(_sub_tags),
+ _label
+ )
+ )
+ # save the url and start search
+ for _stag in _sub_tags:
+ _u = _baseurl + _stag
+ _index += 1
+ logger.debug(
+ "... searching repos in '{}/{}'".format(
+ _label,
+ _stag
+ )
+ )
+
+ # Searching Package collections
+ if _stag in ubuntu_releases:
+ # if the subtag is a release itself, this is the 'all' packages set
+ _repo["all"][_stag] = []
+ _repo["all"]["url"] = _n_url(_u)
+ _path_list = self.search_pkg(_n_url(_u), [])
+ self._map_repo(_path_list, _repo["all"][_stag])
+ logger.info(
+ "-> found {} dists".format(
+ len(_repo["all"][_stag])
+ )
+ )
+
+ else:
+ # each subtag might have any ubuntu release
+ # so iterate them
+ _repo[_stag] = {
+ "url": _n_url(_u)
+ }
+ _releases, _ = self._ls_repo_page(_n_url(_u))
+ for _rel in _releases:
+ if _rel not in ubuntu_releases:
+ logger.debug(
+ "... skipped unknown ubuntu release: "
+ "'{}' in '{}'".format(
+ _rel,
+ _u
+ )
+ )
+ else:
+ _rel_u = _n_url(_u) + _rel
+ _repo[_stag][_rel] = []
+ _path_list = self.search_pkg(_n_url(_rel_u), [])
+ self._map_repo(
+ _path_list,
+ _repo[_stag][_rel]
+ )
+ logger.info(
+ "-> found {} dists for '{}'".format(
+ len(_repo[_stag][_rel]),
+ _rel
+ )
+ )
+ _progress.write_progress(_index)
+
+ _progress.end()
+ _name = _label + ext
+ _repotgz.add_file(_name, buf=json.dumps(_repo, indent=2))
+ logger_cli.info(
+ "-> archive '{}' updated with '{}'".format(
+ self._repofile,
+ _name
+ )
+ )
+
+ return
+
+ def list_tags(self):
+ _files = TGZFile(self._repofile).list_files()
+ # all files in archive with no '.json' part
+ _all = set([f.rsplit('.', 1)[0] for f in _files])
+ # files that end with '.update'
+ _updates = set([f for f in _all if f.find('update') >= 0])
+ # files that end with '.hotfix'
+ _hotfix = set([f for f in _all if f.find('hotfix') >= 0])
+ # remove updates and hotfix tags from all. The true magic of SETs
+ _all = _all - _updates - _hotfix
+ # cut updates and hotfix endings
+ _updates = [f.rsplit('.', 1)[0] for f in _updates]
+ _hotfix = [f.rsplit('.', 1)[0] for f in _hotfix]
+
+ return _all, _updates, _hotfix
+
+ def get_repoinfo(self, tag):
+ _tgz = TGZFile(self._repofile)
+ _buf = _tgz.get_file(tag + ext)
+ return json.loads(_buf)
+
+
+class RepoManager(object):
+ # files in archive
+ _repoindexfile = _repos_index_filename
+ _versionsfile = _repos_versions_filename
+ # archives
+ _versions_arch = os.path.join(pkg_dir, "versions", _repos_versions_archive)
+ _desc_arch = os.path.join(pkg_dir, "versions", _pkg_desc_archive)
+
+ # repository index
+ _repo_index = {}
+
+ # init package versions storage
+ _versions = {}
+
+ def __init__(self):
+ # Init version files
+ self.versionstgz = TGZFile(
+ self._versions_arch,
+ label="MCP Configuration Checker: Package versions archive"
+ )
+ self.desctgz = TGZFile(
+ self._desc_arch,
+ label="MCP Configuration Checker: Package descriptions archive"
+ )
+
+ if self._versionsfile in self.versionstgz.list_files():
+ logger_cli.info(
+ "# Loading versions '{}':'{}'".format(
+ self._versions_arch,
+ self._versionsfile
+ )
+ )
+ self._versions = json.loads(
+ self.versionstgz.get_file(self._versionsfile)
+ )
+
+ if self._repoindexfile in self.versionstgz.list_files():
+ self._repo_index = json.loads(
+ self.versionstgz.get_file(
+ self._repoindexfile
+ )
+ )
+
+ def _create_repo_header(self, p):
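+ # builds a flat repo key of the form
+ # "<tag>_<subset>_<release>_<ubuntu-release>_<type>_<arch>"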
+ _header = "_".join([
+ p['tag'],
+ p['subset'],
+ p['release'],
+ p['ubuntu-release'],
+ p['type'],
+ p['arch']
+ ])
+ if not filter(
+ lambda i: self._repo_index[i]["header"] == _header,
+ self._repo_index
+ ):
+ _index = str(len(self._repo_index.keys()) + 1)
+ self._repo_index[_index] = {
+ "header": _header,
+ "props": p
+ }
+ else:
+ for _k, _v in self._repo_index.iteritems():
+ if _v["header"] == _header:
+ _index = _k
+
+ return _index
+
+ def _get_repo_header(self, index):
+ return self._repo_index[index]
+
+ def _update_pkg_version(self, n, v, md5, header_index):
+ """Method updates package version record in global dict
+ """
+ # a four-level 'if' chain is pretty expensive when executed 100k times,
+ # so try/except is a better way to go, even faster than 'reduce'
+ vs = self._versions
+ try:
+ # try to load list
+ _list = vs[n][v][md5]
+ # cast it as set() and union()
+ _list = set(_list).union([header_index])
+ # cast back to list, as set() is not JSON serializable
+ vs[n][v][md5] = list(_list)
+ return False
+ except KeyError:
+ # ok, this is fresh pkg. Do it slow way.
+ if n in vs:
+ # there is such pkg already
+ if v in vs[n]:
+ # there is such version, check md5
+ if md5 in vs[n][v]:
+ # just add new repo header
+ if header_index not in vs[n][v][md5]:
+ vs[n][v][md5].append(header_index)
+ else:
+ # check if this header index is already recorded under another md5
+ _existing = filter(
+ lambda i: header_index in vs[n][v][i],
+ vs[n][v]
+ )
+ if _existing:
+ # Yuck! Same version had different MD5
+ logger_cli.error(
+ "# ERROR: Package version has multiple MD5s "
+ "in '{}': {}:{}:{}".format(
+ self._get_repo_header(
+ header_index
+ )["header"],
+ n,
+ v,
+ md5
+ )
+ )
+ vs[n][v][md5] = [header_index]
+ else:
+ # this is new version for existing package
+ vs[n][v] = {
+ md5: [header_index]
+ }
+ return False
+ else:
+ # this is a new package
+ vs[n] = {
+ v: {
+ md5: [header_index]
+ }
+ }
+ return True
+
+ def _save_repo_descriptions(self, repo_props, desc):
+ # form the filename for the repo and save it
+ self.desctgz.add_file(
+ self._create_repo_header(repo_props),
+ json.dumps(desc)
+ )
+
+ # def get_description(self, repo_props, name, md5=None):
+ # """Gets target description
+ # """
+ # _filename = self._create_repo_header(repo_props)
+ # # check if it is present in cache
+ # if _filename in self._desc_cache:
+ # _descs = self._desc_cache[_filename]
+ # else:
+ # # load data
+ # _descs = self.desctgz.get_file(_filename)
+ # # Serialize it
+ # _descs = json.loads(_descs)
+ # self._desc_cache[_filename] = _descs
+ # # return target desc
+ # if name in _descs and md5 in _descs[name]:
+ # return _descs[name][md5]
+ # else:
+ # return None
+
+ def parse_tag(self, tag, descriptions=False):
+ """Download and parse Package.gz files for specific tag
+ By default, descriptions not saved
+ due to huge resulting file size and slow processing
+ """
+ # init gzip and downloader
+ _info = ReposInfo().get_repoinfo(tag)
+ # calculate Packages.gz files to process
+ _baseurl = _info.pop("baseurl")
+ _total_components = len(_info.keys()) - 1
+ _ubuntu_package_repos = 0
+ _other_repos = 0
+ for _c, _d in _info.iteritems():
+ for _ur, _l in _d.iteritems():
+ if _ur in ubuntu_releases:
+ _ubuntu_package_repos += len(_l)
+ elif _ur != 'url':
+ _other_repos += len(_l)
+ logger_cli.info(
+ "-> loaded repository info for '{}'.\n"
+ " '{}', {} components, {} ubuntu repos, {} other/uknown".format(
+ _baseurl,
+ tag,
+ _total_components,
+ _ubuntu_package_repos,
+ _other_repos
+ )
+ )
+ # init progress bar
+ _progress = Progress(_ubuntu_package_repos)
+ _index = 0
+ _processed = 0
+ _new = 0
+ for _c, _d in _info.iteritems():
+ # we do not need url here, just get rid of it
+ if 'url' in _d:
+ _d.pop('url')
+ for _ur, _l in _d.iteritems():
+ # iterate package collections
+ for _p in _l:
+ # descriptions
+ if descriptions:
+ _descriptions = {}
+ # download and unzip
+ _progress.write_progress(
+ _index,
+ note="/ {} {} {} {} {}, {}/{}".format(
+ _c,
+ _ur,
+ _p['ubuntu-release'],
+ _p['type'],
+ _p['arch'],
+ _processed,
+ _new
+ )
+ )
+ _raw = get_gzipped_file(_p['filepath'])
+ _lines = _raw.splitlines()
+ _index += 1
+ # break lines collection into isolated pkg data
+ _pkg = {
+ "tag": tag,
+ "subset": _c,
+ "release": _ur
+ }
+ _pkg.update(_p)
+ _desc = {}
+ _key = _value = ""
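+ # Packages.gz entries are Debian control stanzas:
+ # 'Key: value' lines, continuation lines start with a space,
+ # and a blank line terminates a package record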
+ for _line in _lines:
+ if not _line:
+ # if the line is empty, process pkg data gathered
+ _name = _desc['package']
+ _md5 = _desc['md5sum']
+ _version = _desc['version']
+ _pkg['md5'] = _md5
+ # update version for a package
+ if self._update_pkg_version(
+ _name,
+ _version,
+ _md5,
+ self._create_repo_header(_pkg)
+ ):
+ _new += 1
+
+ if descriptions:
+ _d_new = {
+ _md5: deepcopy(_desc)
+ }
+ try:
+ _descriptions[_name].update(_d_new)
+ except KeyError:
+ _descriptions[_name] = _d_new
+ # clear the data for next pkg
+ _processed += 1
+ _desc = {}
+ _key = ""
+ _value = ""
+ elif _line.startswith(' '):
+ _desc[_key] += "\n{}".format(_line)
+ else:
+ _key, _value = _line.split(': ', 1)
+ _key = _key.lower()
+
+ _desc[_key] = _value
+ # save descriptions if needed
+ if descriptions:
+ _progress.clearline()
+ self._save_repo_descriptions(_pkg, _descriptions)
+
+ _progress.end()
+ # backup headers to disk
+ self.versionstgz.add_file(
+ self._repoindexfile,
+ json.dumps(self._repo_index),
+ replace=True
+ )
+ return
+
+ def fetch_versions(self, tag, descriptions=False):
+ """Executes parsing for specific tag
+ """
+ if descriptions:
+ logger_cli.warning(
+ "\n\n# !!! WARNING: Saving repo descriptions "
+ "consumes huge amount of disk space\n\n"
+ )
+ # parse versions for the tag from the stored repoinfo
+ _f = self._versionsfile
+ logger_cli.info("# Fetching versions for {}".format(tag))
+ self.parse_tag(tag, descriptions=descriptions)
+ logger_cli.info("-> saving updated versions to {}".format(_f))
+ self.versionstgz.add_file(_f, json.dumps(self._versions), replace=True)
+
+ def build_repos(self, url, tag=None):
+ """Builds versions data for selected tag, or for all of them
+ """
+ # Init the ReposInfo class and check if all files are present
+ _repos = ReposInfo()
+ # recursively walk the mirrors
+ # and gather all of the repos for 'tag' or all of the tags
+ _repos.fetch_repos(url, tag=tag)
+
+ def action_for_tag(
+ self,
+ url,
+ tag,
+ action=None,
+ descriptions=None
+ ):
+ """Executes action for every tag from all collections
+ """
+ if not action:
+ logger_cli.info("# No action set, nothing to do")
+ # get all tags
+ major, updates, hotfix = ReposInfo().list_tags()
+ if action == "list":
+ logger_cli.info("# Tags available at '{}':".format(url))
+ for t in major:
+ logger_cli.info("\t{}".format(t))
+ for t in updates:
+ logger_cli.info("\t{} [updates]".format(t))
+ for t in hotfix:
+ logger_cli.info("\t{} [hotfix]".format(t))
+ return
+ # Populate action tags
+ _action_tags = []
+ if tag in major:
+ _action_tags.append(tag)
+ elif tag in updates:
+ _action_tags.append(tag + ".update")
+ elif tag in hotfix:
+ _action_tags.append(tag + ".hotfix")
+
+ if not _action_tags:
+ logger_cli.info(
+ "# Tag of '{}' not found. "
+ "Consider rebuilding repos info.".format(tag)
+ )
+ elif action == "build":
+ logger_cli.info(
+ "-> tags to build {}".format(", ".join(_action_tags))
+ )
+ for t in _action_tags:
+ logger_cli.info(
+ "# Building repo info for '{}/{}'".format(
+ url,
+ tag
+ )
+ )
+ self.build_repos(url, tag=tag)
+ elif action == "fetch":
+ logger_cli.info(
+ "-> fetching versions for tags {}".format(
+ ", ".join(_action_tags)
+ )
+ )
+ for t in _action_tags:
+ self.fetch_versions(t, descriptions=descriptions)
+
+ logger_cli.info("# Done.")
+
+ def parse_repos(self):
+ # all tags to check
+ major, updates, hotfix = ReposInfo().list_tags()
+
+ # major tags
+ logger_cli.info("# Processing major tags")
+ for _tag in major:
+ self.fetch_versions(_tag)
+
+ # updates tags
+ logger_cli.info("# Processing update tags")
+ for _tag in updates:
+ self.fetch_versions(_tag + ".update")
+
+ # hotfix tags
+ logger_cli.info("# Processing hotfix tags")
+ for _tag in hotfix:
+ self.fetch_versions(_tag + ".hotfix")
diff --git a/cfg_checker/modules/packages/versions.py b/cfg_checker/modules/packages/versions.py
index 9737d80..a2bd083 100644
--- a/cfg_checker/modules/packages/versions.py
+++ b/cfg_checker/modules/packages/versions.py
@@ -19,7 +19,11 @@
def __init__(self):
# preload csv file
logger_cli.info("# Preloading MCP release versions")
- with open(os.path.join(pkg_dir, 'etc', config.pkg_versions_map)) as f:
+ with open(os.path.join(
+ pkg_dir,
+ 'versions',
+ config.pkg_versions_map)
+ ) as f:
_reader = csv.reader(f, delimiter=',')
# load packages
for row in _reader: