import json
import os
from copy import deepcopy

from cfg_checker.common import logger, logger_cli
from cfg_checker.common.const import _pkg_desc_archive
from cfg_checker.common.const import _repos_index_filename
from cfg_checker.common.const import _repos_info_archive
from cfg_checker.common.const import _repos_versions_archive
from cfg_checker.common.const import _repos_versions_filename
from cfg_checker.common.const import ubuntu_releases
from cfg_checker.common.file_utils import get_gzipped_file
from cfg_checker.common.settings import pkg_dir
from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile

import requests
from requests.exceptions import ConnectionError

ext = ".json"


def _n_url(url):
    if url[-1] == '/':
        return url
    else:
        return url + '/'


class ReposInfo(object):
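    """Collects links to 'Packages.gz' files from a Debian/Ubuntu style
    mirror and stores the parsed repo structure as JSON files inside a
    tgz archive.

    Minimal usage sketch (URL and tag are illustrative placeholders):

        info = ReposInfo()
        info.fetch_repos("http://mirror.example.com/", tag="2019.2.0")
        all_tags, update_tags, hotfix_tags = info.list_tags()
        repo_data = info.get_repoinfo("2019.2.0")
    """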
    repos = []
    _repofile = os.path.join(pkg_dir, "versions", _repos_info_archive)

    @staticmethod
    def _ls_repo_page(url):
        # Yes, this is ugly. But it works ok for small HTMLs.
        _a = "<a"
        _s = "href="
        _e = "\">"
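        # The parsing below assumes Apache-style autoindex anchor rows:
        # the text between 'href="' and '">' is taken as the entry name;
        # rows ending with '-' (directories show no size) are treated as
        # dirs, the rest as files.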
        try:
            page = requests.get(url, timeout=60)
        except ConnectionError as e:
            logger_cli.error("# ERROR: {}".format(e))
            return [], []
        a = page.text.splitlines()
        # Comprehension for dirs: anchor lines ending with '-' are dirs
        _dirs = [l[l.index(_s)+6:l.index(_e)-1]
                 for l in a if l.startswith(_a) and l.endswith('-')]
        # Comprehension for files: anchor lines ending with a size are files
        _files = [l[l.index(_s)+6:l.index(_e)]
                  for l in a if l.startswith(_a) and not l.endswith('-')]

        return _dirs, _files

    def search_pkg(self, url, _list):
        # recursive method to walk the dists tree
        _dirs, _files = self._ls_repo_page(url)

        for _d in _dirs:
            # Search only in dists, ignore the rest
            if "dists" not in url and _d != "dists":
                continue
            _u = _n_url(url + _d)
            self.search_pkg(_u, _list)

        for _f in _files:
            if _f == "Packages.gz":
                _list.append(url + _f)
                logger.debug("... [F] '{}'".format(url + _f))

        return _list

    @staticmethod
    def _map_repo(_path_list, _r):
        for _pkg_path in _path_list:
            _l = _pkg_path.split('/')
            _kw = _l[_l.index('dists')+1:]
            _kw.reverse()
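            # Assumed path layout below 'dists':
            #   dists/<ubuntu-release>/<type>/binary-<arch>/Packages.gz
            # so, after reversing, _kw holds
            #   [<filename>, binary-<arch>, <type>, <ubuntu-release>, ...]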
            _repo_item = {
                "arch": _kw[1][7:] if "binary" in _kw[1] else _kw[1],
                "type": _kw[2],
                "ubuntu-release": _kw[3],
                "filepath": _pkg_path
            }
            _r.append(_repo_item)

    def _find_tag(self, _t, _u, label=""):
        if label:
            _url = _n_url(_u + label)
            _label = _t + '.' + label
        else:
            _url = _u
            _label = _t
        _ts, _ = self._ls_repo_page(_url)
        if _t in _ts:
            logger.debug(
                "... found tag '{}' at '{}'".format(
                    _t,
                    _url
                )
            )
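            # Returned mapping (as consumed by fetch_repos below):
            #   {"<tag>" or "<tag>.<label>": {"baseurl": <url>, "all": {}}}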
            return {
                _label: {
                    "baseurl": _n_url(_url + _t),
                    "all": {}
                }
            }
        else:
            return {}

    def fetch_repos(self, url, tag=None):
        base_url = _n_url(url)
        logger_cli.info("# Using '{}' as a repos source".format(base_url))

        logger_cli.info(
            "# Gathering repos info (i.e. links to 'Packages.gz')"
        )
        # init repoinfo archive
        _repotgz = TGZFile(self._repofile)
        # prepare repo links
        _repos = {}
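        # A tag can live at the mirror root as well as under the
        # 'hotfix' and 'update' subfolders (assumed mirror layout),
        # so all three locations are checked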
        if tag:
            # only one tag to process
            _repos.update(self._find_tag(tag, base_url))
            _repos.update(self._find_tag(tag, base_url, label="hotfix"))
            _repos.update(self._find_tag(tag, base_url, label="update"))
        else:
            # gather all of them
            _tags, _ = self._ls_repo_page(base_url)
            # 'hotfix' and 'update' folders are not tags themselves
            for _sub in ('hotfix', 'update'):
                if _sub in _tags:
                    _tags.remove(_sub)
            # search tags in subfolders
            _h_tags, _ = self._ls_repo_page(base_url + 'hotfix')
            _u_tags, _ = self._ls_repo_page(base_url + 'update')
            _tags.extend([t for t in _h_tags if t not in _tags])
            _tags.extend([t for t in _u_tags if t not in _tags])
            _progress = Progress(len(_tags))
            _index = 0
            for _tag in _tags:
                _repos.update(self._find_tag(_tag, base_url))
                _repos.update(self._find_tag(_tag, base_url, label="hotfix"))
                _repos.update(self._find_tag(_tag, base_url, label="update"))
                _index += 1
                _progress.write_progress(_index)
            _progress.end()

        # parse subtags
        for _label in _repos.keys():
            logger_cli.info("-> processing tag '{}'".format(_label))
            _name = _label + ".json"
            if _repotgz.has_file(_name):
                logger_cli.info(
                    "-> skipping, '{}' already has '{}'".format(
                        _repos_info_archive,
                        _name
                    )
                )
                continue
            # process the tag
            _repo = _repos[_label]
            _baseurl = _repos[_label]["baseurl"]
            # get the subtags
            _sub_tags, _ = self._ls_repo_page(_baseurl)
            _total_index = len(_sub_tags)
            _index = 0
            _progress = Progress(_total_index)
            logger.debug(
                "... found {} subtags for '{}'".format(
                    len(_sub_tags),
                    _label
                )
            )
            # save the url and start search
            for _stag in _sub_tags:
                _u = _baseurl + _stag
                _index += 1
                logger.debug(
                    "... searching repos in '{}/{}'".format(
                        _label,
                        _stag
                    )
                )

                # Searching Package collections
                if _stag in ubuntu_releases:
                    # if stag is the release, this is all packages
                    _repo["all"][_stag] = []
                    _repo["all"]["url"] = _n_url(_u)
                    _path_list = self.search_pkg(_n_url(_u), [])
                    self._map_repo(_path_list, _repo["all"][_stag])
                    logger.info(
                        "-> found {} dists".format(
                            len(_repo["all"][_stag])
                        )
                    )

                else:
                    # each subtag might have any ubuntu release
                    # so iterate them
                    _repo[_stag] = {
                        "url": _n_url(_u)
                    }
                    _releases, _ = self._ls_repo_page(_n_url(_u))
                    for _rel in _releases:
                        if _rel not in ubuntu_releases:
                            logger.debug(
                                "... skipped unknown ubuntu release: "
                                "'{}' in '{}'".format(
                                    _rel,
                                    _u
                                )
                            )
                        else:
                            _rel_u = _n_url(_u) + _rel
                            _repo[_stag][_rel] = []
                            _path_list = self.search_pkg(_n_url(_rel_u), [])
                            self._map_repo(
                                _path_list,
                                _repo[_stag][_rel]
                            )
                            logger.info(
                                "-> found {} dists for '{}'".format(
                                    len(_repo[_stag][_rel]),
                                    _rel
                                )
                            )
                _progress.write_progress(_index)

            _progress.end()
            _name = _label + ext
            _repotgz.add_file(_name, buf=json.dumps(_repo, indent=2))
            logger_cli.info(
                "-> archive '{}' updated with '{}'".format(
                    self._repofile,
                    _name
                )
            )

        return

    def list_tags(self):
        _files = TGZFile(self._repofile).list_files()
        # all files in the archive, with the '.json' part stripped
        _all = set([f.rsplit('.', 1)[0] for f in _files])
        # files that contain '.update'
        _updates = set([f for f in _all if f.find('update') >= 0])
        # files that contain '.hotfix'
        _hotfix = set([f for f in _all if f.find('hotfix') >= 0])
        # remove updates and hotfix tags from all. The true magic of SETs
        _all = _all - _updates - _hotfix
        # cut the '.update' and '.hotfix' endings
        _updates = [f.rsplit('.', 1)[0] for f in _updates]
        _hotfix = [f.rsplit('.', 1)[0] for f in _hotfix]

        return _all, _updates, _hotfix

    def get_repoinfo(self, tag):
        _tgz = TGZFile(self._repofile)
        _buf = _tgz.get_file(tag + ext)
        return json.loads(_buf)


class RepoManager(object):
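    """Downloads and parses 'Packages.gz' indexes for tags collected by
    ReposInfo and maintains the package versions and repo index archives.

    Minimal usage sketch (URL and tag are illustrative placeholders):

        manager = RepoManager()
        manager.action_for_tag(
            "http://mirror.example.com/",
            "2019.2.0",
            action="fetch",
            descriptions=False
        )
    """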
    # files in archive
    _repoindexfile = _repos_index_filename
    _versionsfile = _repos_versions_filename
    # archives
    _versions_arch = os.path.join(pkg_dir, "versions", _repos_versions_archive)
    _desc_arch = os.path.join(pkg_dir, "versions", _pkg_desc_archive)

    # repository index
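    # maps a string index to {"header": <joined props>, "props": {...}},
    # as built by _create_repo_header below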
    _repo_index = {}

    # init package versions storage
    _versions = {}

    def __init__(self):
        # Init version files
        self.versionstgz = TGZFile(
            self._versions_arch,
            label="MCP Configuration Checker: Package versions archive"
        )
        self.desctgz = TGZFile(
            self._desc_arch,
            label="MCP Configuration Checker: Package descriptions archive"
        )

        if self._versionsfile in self.versionstgz.list_files():
            logger_cli.info(
                "# Loading versions '{}':'{}'".format(
                    self._versions_arch,
                    self._versionsfile
                )
            )
            self._versions = json.loads(
                self.versionstgz.get_file(self._versionsfile)
            )

        if self._repoindexfile in self.versionstgz.list_files():
            self._repo_index = json.loads(
                self.versionstgz.get_file(
                    self._repoindexfile
                )
            )

    def _create_repo_header(self, p):
        _header = "_".join([
            p['tag'],
            p['subset'],
            p['release'],
            p['ubuntu-release'],
            p['type'],
            p['arch']
        ])
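        # The header is an underscore-joined key, e.g. (illustrative):
        #   '2019.2.0_all_xenial_xenial_main_amd64'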
        # check if such a header is already indexed
        _existing = [
            _k for _k, _v in self._repo_index.items()
            if _v["header"] == _header
        ]
        if not _existing:
            _index = str(len(self._repo_index.keys()) + 1)
            self._repo_index[_index] = {
                "header": _header,
                "props": p
            }
        else:
            _index = _existing[0]

        return _index

    def _get_repo_header(self, index):
        return self._repo_index[index]

    def _update_pkg_version(self, n, v, md5, header_index):
        """Updates a package version record in the global dict
        """
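        # self._versions is a nested dict of the shape
        #   {<name>: {<version>: {<md5>: [<repo header index>, ...]}}}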
        # Four nested 'if' checks get expensive when repeated ~100k times,
        # so try/except on the happy path is faster (even than 'reduce')
        vs = self._versions
        try:
            # try to load list
            _list = vs[n][v][md5]
            # cast it as set() and union()
            _list = set(_list).union([header_index])
            # cast back to list, as set() is not JSON-serializable
            vs[n][v][md5] = list(_list)
            return False
        except KeyError:
            # ok, this is a fresh pkg. Do it the slow way.
            if n in vs:
                # there is such pkg already
                if v in vs[n]:
                    # there is such version, check md5
                    if md5 in vs[n][v]:
                        # just add new repo header
                        if header_index not in vs[n][v][md5]:
                            vs[n][v][md5].append(header_index)
                    else:
                        # check if this header index is already recorded
                        # under another md5 for the same version
                        _existing = [
                            i for i in vs[n][v]
                            if header_index in vs[n][v][i]
                        ]
                        if _existing:
                            # Yuck! Same version had different MD5
                            logger_cli.error(
                                "# ERROR: Package version has multiple MD5s "
                                "in '{}': {}:{}:{}".format(
                                    self._get_repo_header(
                                        header_index
                                    )["header"],
                                    n,
                                    v,
                                    md5
                                )
                            )
                        vs[n][v][md5] = [header_index]
                else:
                    # this is a new version for an existing package
                    vs[n][v] = {
                        md5: [header_index]
                    }
                return False
            else:
                # this is a new package
                vs[n] = {
                    v: {
                        md5: [header_index]
                    }
                }
            return True

    def _save_repo_descriptions(self, repo_props, desc):
        # form the filename for the repo and save it
        self.desctgz.add_file(
            self._create_repo_header(repo_props),
            json.dumps(desc)
        )

    # def get_description(self, repo_props, name, md5=None):
    #     """Gets target description
    #     """
    #     _filename = self._create_repo_header(repo_props)
    #     # check if it is present in cache
    #     if _filename in self._desc_cache:
    #         _descs = self._desc_cache[_filename]
    #     else:
    #         # load data
    #         _descs = self.desctgz.get_file(_filename)
    #         # Serialize it
    #         _descs = json.loads(_descs)
    #         self._desc_cache[_filename] = _descs
    #     # return target desc
    #     if name in _descs and md5 in _descs[name]:
    #         return _descs[name][md5]
    #     else:
    #         return None

    def parse_tag(self, tag, descriptions=False):
        """Downloads and parses Packages.gz files for a specific tag.
        By default, descriptions are not saved
        due to the huge resulting file size and slow processing
        """
        # init gzip and downloader
        _info = ReposInfo().get_repoinfo(tag)
        # calculate Packages.gz files to process
        _baseurl = _info.pop("baseurl")
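        # What remains in _info (as written by ReposInfo.fetch_repos) is
        #   {<subset>: {"url": <url>, <ubuntu release>: [<repo items>]}}
        # where each repo item carries 'filepath', 'arch', 'type'
        # and 'ubuntu-release' keys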
        _total_components = len(_info.keys()) - 1
        _ubuntu_package_repos = 0
        _other_repos = 0
        for _c, _d in _info.items():
            for _ur, _l in _d.items():
                if _ur in ubuntu_releases:
                    _ubuntu_package_repos += len(_l)
                elif _ur != 'url':
                    _other_repos += len(_l)
        logger_cli.info(
            "-> loaded repository info for '{}'.\n"
            "  '{}', {} components, {} ubuntu repos, {} other/unknown".format(
                _baseurl,
                tag,
                _total_components,
                _ubuntu_package_repos,
                _other_repos
            )
        )
        # init progress bar
        _progress = Progress(_ubuntu_package_repos)
        _index = 0
        _processed = 0
        _new = 0
        for _c, _d in _info.items():
            # we do not need url here, just get rid of it
            if 'url' in _d:
                _d.pop('url')
            # _url = if 'url' in _d else _baseurl + _c
            for _ur, _l in _d.items():
                # iterate package collections
                for _p in _l:
                    # descriptions
                    if descriptions:
                        _descriptions = {}
                    # download and unzip
                    _progress.write_progress(
                        _index,
                        note="/ {} {} {} {} {}, {}/{}".format(
                            _c,
                            _ur,
                            _p['ubuntu-release'],
                            _p['type'],
                            _p['arch'],
                            _processed,
                            _new
                        )
                    )
                    _raw = get_gzipped_file(_p['filepath'])
                    _lines = _raw.splitlines()
                    _index += 1
                    # break lines collection into isolated pkg data
                    _pkg = {
                        "tag": tag,
                        "subset": _c,
                        "release": _ur
                    }
                    _pkg.update(_p)
                    _desc = {}
                    _key = _value = ""
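                    # A Packages index is a series of blank-line separated
                    # 'Key: Value' stanzas; continuation lines start with
                    # a space. Abbreviated example (illustrative):
                    #   Package: nova-common
                    #   Version: 2:17.0.0-1
                    #   MD5sum: 0123456789abcdef0123456789abcdef
                    #
                    #   Package: ...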
                    for _line in _lines:
                        if not _line:
                            # if the line is empty, process pkg data gathered
                            if not _desc:
                                # guard against consecutive empty lines
                                continue
                            _name = _desc['package']
                            _md5 = _desc['md5sum']
                            _version = _desc['version']
                            _pkg['md5'] = _md5
                            # update version for a package
                            if self._update_pkg_version(
                                _name,
                                _version,
                                _md5,
                                self._create_repo_header(_pkg)
                            ):
                                _new += 1

                            if descriptions:
                                _d_new = {
                                    _md5: deepcopy(_desc)
                                }
                                try:
                                    _descriptions[_name].update(_d_new)
                                except KeyError:
                                    _descriptions[_name] = _d_new
                            # clear the data for the next pkg
                            _processed += 1
                            _desc = {}
                            _key = ""
                            _value = ""
                        elif _line.startswith(' '):
                            # continuation of a multi-line value
                            _desc[_key] += "\n{}".format(_line)
                        else:
                            _key, _value = _line.split(': ', 1)
                            _key = _key.lower()

                            _desc[_key] = _value
                    # save descriptions if needed
                    if descriptions:
                        _progress.clearline()
                        self._save_repo_descriptions(_pkg, _descriptions)

        _progress.end()
        # backup headers to disk
        self.versionstgz.add_file(
            self._repoindexfile,
            json.dumps(self._repo_index),
            replace=True
        )
        return

    def fetch_versions(self, tag, descriptions=False):
        """Executes parsing for a specific tag
        """
        if descriptions:
            logger_cli.warning(
                "\n\n# !!! WARNING: Saving repo descriptions "
                "consumes a huge amount of disk space\n\n"
            )
        # parse the tag using the repo info stored on disk
        _f = self._versionsfile
        logger_cli.info("# Fetching versions for {}".format(tag))
        self.parse_tag(tag, descriptions=descriptions)
        logger_cli.info("-> saving updated versions to {}".format(_f))
        self.versionstgz.add_file(
            _f,
            json.dumps(self._versions),
            replace=True
        )

    def build_repos(self, url, tag=None):
        """Builds versions data for the selected tag, or for all of them
        """
        # Init the ReposInfo class and check if all files are present
        _repos = ReposInfo()
        # recursively walk the mirrors
        # and gather all of the repos for 'tag' or all of the tags
        _repos.fetch_repos(url, tag=tag)

    def action_for_tag(
        self,
        url,
        tag,
        action=None,
        descriptions=None
    ):
        """Executes an action for every tag from all collections
        """
        if not action:
            logger_cli.info("# No action set, nothing to do")
            return
        # get all tags
        major, updates, hotfix = ReposInfo().list_tags()
        if action == "list":
            logger_cli.info("# Tags available at '{}':".format(url))
            for t in major:
                logger_cli.info("\t{}".format(t))
            for t in updates:
                logger_cli.info("\t{} [updates]".format(t))
            for t in hotfix:
                logger_cli.info("\t{} [hotfix]".format(t))
            return
        # Populate action tags
        _action_tags = []
        if tag in major:
            _action_tags.append(tag)
        elif tag in updates:
            _action_tags.append(tag + ".update")
        elif tag in hotfix:
            _action_tags.append(tag + ".hotfix")

        if not _action_tags:
            logger_cli.info(
                "# Tag of '{}' not found. "
                "Consider rebuilding repos info.".format(tag)
            )
        elif action == "build":
            logger_cli.info(
                "-> tags to build {}".format(", ".join(_action_tags))
            )
            for t in _action_tags:
                logger_cli.info(
                    "# Building repo info for '{}/{}'".format(
                        url,
                        t
                    )
                )
                self.build_repos(url, tag=tag)
        elif action == "fetch":
            logger_cli.info(
                "-> fetching versions for tags {}".format(
                    ", ".join(_action_tags)
                )
            )
            for t in _action_tags:
                self.fetch_versions(t, descriptions=descriptions)

        logger_cli.info("# Done.")

    def parse_repos(self):
        # all tags to check
        major, updates, hotfix = ReposInfo().list_tags()

        # major tags
        logger_cli.info("# Processing major tags")
        for _tag in major:
            self.fetch_versions(_tag)

        # updates tags
        logger_cli.info("# Processing update tags")
        for _tag in updates:
            self.fetch_versions(_tag + ".update")

        # hotfix tags
        logger_cli.info("# Processing hotfix tags")
        for _tag in hotfix:
            self.fetch_versions(_tag + ".hotfix")