import json
import os
from copy import deepcopy
from cfg_checker.common import logger, logger_cli
from cfg_checker.common.const import _pkg_desc_archive
from cfg_checker.common.const import _repos_index_filename
from cfg_checker.common.const import _repos_info_archive
from cfg_checker.common.const import _repos_versions_archive
from cfg_checker.common.const import _repos_versions_filename
from cfg_checker.common.const import ubuntu_releases
from cfg_checker.common.file_utils import get_gzipped_file
from cfg_checker.common.settings import pkg_dir
from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile
import requests
from requests.exceptions import ConnectionError
ext = ".json"
def _n_url(url):
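    """Return the URL with a trailing '/' appended when it is missing."""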
if url[-1] == '/':
return url
else:
return url + '/'
class ReposInfo(object):
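    """Walks a mirror tree and caches links to 'Packages.gz' files
    (per tag) in the repos info tgz archive.
    """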
repos = []
_repofile = os.path.join(pkg_dir, "versions", _repos_info_archive)
@staticmethod
def _ls_repo_page(url):
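        """Parse an autoindex-style HTML page into (dirs, files) name lists."""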
# Yes, this is ugly. But it works ok for small HTMLs.
_a = "<a"
_s = "href="
_e = "\">"
try:
page = requests.get(url, timeout=60)
except ConnectionError as e:
            logger_cli.error("# ERROR: {}".format(e))
return [], []
a = page.text.splitlines()
        # Comprehension for dirs: anchor lines ending with '-' are directories
_dirs = [l[l.index(_s)+6:l.index(_e)-1]
for l in a if l.startswith(_a) and l.endswith('-')]
        # Comprehension for files: anchor lines ending with a size are files
_files = [l[l.index(_s)+6:l.index(_e)]
for l in a if l.startswith(_a) and not l.endswith('-')]
return _dirs, _files
def search_pkg(self, url, _list):
        # recursive method to walk the dists tree
_dirs, _files = self._ls_repo_page(url)
for _d in _dirs:
# Search only in dists, ignore the rest
if "dists" not in url and _d != "dists":
continue
_u = _n_url(url + _d)
self.search_pkg(_u, _list)
for _f in _files:
if _f == "Packages.gz":
_list.append(url + _f)
logger.debug("... [F] '{}'".format(url + _f))
return _list
@staticmethod
def _map_repo(_path_list, _r):
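        """Map each 'Packages.gz' path onto a repo item dict, appending to _r."""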
for _pkg_path in _path_list:
_l = _pkg_path.split('/')
_kw = _l[_l.index('dists')+1:]
_kw.reverse()
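            # reversed components of the path after 'dists', e.g. for
            # '.../dists/xenial/main/binary-amd64/Packages.gz' this yields
            # ['Packages.gz', 'binary-amd64', 'main', 'xenial']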
_repo_item = {
"arch": _kw[1][7:] if "binary" in _kw[1] else _kw[1],
"type": _kw[2],
"ubuntu-release": _kw[3],
"filepath": _pkg_path
}
_r.append(_repo_item)
def _find_tag(self, _t, _u, label=""):
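        """Look for tag _t at _u (optionally under a label subfolder) and
        return its repo skeleton dict, or {} when the tag is not there.
        """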
if label:
_url = _n_url(_u + label)
_label = _t + '.' + label
else:
_url = _u
_label = _t
_ts, _ = self._ls_repo_page(_url)
if _t in _ts:
logger.debug(
"... found tag '{}' at '{}'".format(
_t,
_url
)
)
return {
_label: {
"baseurl": _n_url(_url + _t),
"all": {}
}
}
else:
return {}
def fetch_repos(self, url, tag=None):
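        """Gather 'Packages.gz' links for one tag (or all tags found at the
        mirror URL) and save the parsed structure into the repoinfo archive.
        """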
base_url = _n_url(url)
logger_cli.info("# Using '{}' as a repos source".format(base_url))
logger_cli.info("# Gathering repos info (i.e. links to 'packages.gz')")
# init repoinfo archive
_repotgz = TGZFile(self._repofile)
# prepare repo links
_repos = {}
if tag:
# only one tag to process
_repos.update(self._find_tag(tag, base_url))
_repos.update(self._find_tag(tag, base_url, label="hotfix"))
_repos.update(self._find_tag(tag, base_url, label="update"))
else:
# gather all of them
_tags, _ = self._ls_repo_page(base_url)
            # 'hotfix' and 'update' are labels, not tags; drop them if present
            if 'hotfix' in _tags:
                _tags.remove('hotfix')
            if 'update' in _tags:
                _tags.remove('update')
# search tags in subfolders
_h_tags, _ = self._ls_repo_page(base_url + 'hotfix')
_u_tags, _ = self._ls_repo_page(base_url + 'update')
_tags.extend([t for t in _h_tags if t not in _tags])
_tags.extend([t for t in _u_tags if t not in _tags])
_progress = Progress(len(_tags))
_index = 0
for _tag in _tags:
_repos.update(self._find_tag(_tag, base_url))
_repos.update(self._find_tag(_tag, base_url, label="hotfix"))
_repos.update(self._find_tag(_tag, base_url, label="update"))
_index += 1
_progress.write_progress(_index)
_progress.end()
# parse subtags
for _label in _repos.keys():
logger_cli.info("-> processing tag '{}'".format(_label))
            _name = _label + ext
if _repotgz.has_file(_name):
logger_cli.info(
"-> skipping, '{}' already has '{}'".format(
_repos_info_archive,
_name
)
)
continue
# process the tag
_repo = _repos[_label]
_baseurl = _repos[_label]["baseurl"]
# get the subtags
_sub_tags, _ = self._ls_repo_page(_baseurl)
_total_index = len(_sub_tags)
_index = 0
_progress = Progress(_total_index)
logger.debug(
"... found {} subtags for '{}'".format(
len(_sub_tags),
_label
)
)
# save the url and start search
for _stag in _sub_tags:
_u = _baseurl + _stag
_index += 1
logger.debug(
"... searching repos in '{}/{}'".format(
_label,
_stag
)
)
# Searching Package collections
if _stag in ubuntu_releases:
# if stag is the release, this is all packages
_repo["all"][_stag] = []
_repo["all"]["url"] = _n_url(_u)
_path_list = self.search_pkg(_n_url(_u), [])
self._map_repo(_path_list, _repo["all"][_stag])
logger.info(
"-> found {} dists".format(
len(_repo["all"][_stag])
)
)
else:
# each subtag might have any ubuntu release
# so iterate them
_repo[_stag] = {
"url": _n_url(_u)
}
_releases, _ = self._ls_repo_page(_n_url(_u))
for _rel in _releases:
if _rel not in ubuntu_releases:
logger.debug(
"... skipped unknown ubuntu release: "
"'{}' in '{}'".format(
_rel,
_u
)
)
else:
_rel_u = _n_url(_u) + _rel
_repo[_stag][_rel] = []
_path_list = self.search_pkg(_n_url(_rel_u), [])
self._map_repo(
_path_list,
_repo[_stag][_rel]
)
logger.info(
"-> found {} dists for '{}'".format(
len(_repo[_stag][_rel]),
_rel
)
)
_progress.write_progress(_index)
_progress.end()
_name = _label + ext
_repotgz.add_file(_name, buf=json.dumps(_repo, indent=2))
logger_cli.info(
"-> archive '{}' updated with '{}'".format(
self._repofile,
_name
)
)
return
def list_tags(self):
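        """List tags stored in the repoinfo archive as three collections:
        (major, updates, hotfix).
        """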
_files = TGZFile(self._repofile).list_files()
# all files in archive with no '.json' part
_all = set([f.rsplit('.', 1)[0] for f in _files])
        # files that end with '.update'
        _updates = set([f for f in _all if f.find('update') >= 0])
        # files that end with '.hotfix'
        _hotfix = set([f for f in _all if f.find('hotfix') >= 0])
# remove updates and hotfix tags from all. The true magic of SETs
_all = _all - _updates - _hotfix
# cut updates and hotfix endings
_updates = [f.rsplit('.', 1)[0] for f in _updates]
_hotfix = [f.rsplit('.', 1)[0] for f in _hotfix]
return _all, _updates, _hotfix
def get_repoinfo(self, tag):
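        """Load the saved repo structure for a tag from the archive."""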
_tgz = TGZFile(self._repofile)
_buf = _tgz.get_file(tag + ext)
return json.loads(_buf)
class RepoManager(object):
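    """Downloads and parses 'Packages.gz' files for collected repos and
    maintains the package versions index in tgz archives.
    """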
# files in archive
_repoindexfile = _repos_index_filename
_versionsfile = _repos_versions_filename
# archives
_versions_arch = os.path.join(pkg_dir, "versions", _repos_versions_archive)
_desc_arch = os.path.join(pkg_dir, "versions", _pkg_desc_archive)
# repository index
_repo_index = {}
# init package versions storage
_versions = {}
def __init__(self):
# Init version files
self.versionstgz = TGZFile(
self._versions_arch,
label="MCP Configuration Checker: Package versions archive"
)
self.desctgz = TGZFile(
self._desc_arch,
label="MCP Configuration Checker: Package descriptions archive"
)
if self._versionsfile in self.versionstgz.list_files():
logger_cli.info(
"# Loading versions '{}':'{}'".format(
self._versions_arch,
self._versionsfile
)
)
self._versions = json.loads(
self.versionstgz.get_file(self._versionsfile)
)
if self._repoindexfile in self.versionstgz.list_files():
self._repo_index = json.loads(
self.versionstgz.get_file(
self._repoindexfile
)
)
def _create_repo_header(self, p):
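        """Return the index of the repo header built from props p,
        registering a new index entry when the header is not known yet.
        """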
_header = "_".join([
p['tag'],
p['subset'],
p['release'],
p['ubuntu-release'],
p['type'],
p['arch']
])
if not filter(
lambda i: self._repo_index[i]["header"] == _header,
self._repo_index
):
_index = str(len(self._repo_index.keys()) + 1)
self._repo_index[_index] = {
"header": _header,
"props": p
}
else:
for _k, _v in self._repo_index.iteritems():
if _v["header"] == _header:
_index = _k
return _index
def _get_repo_header(self, index):
return self._repo_index[index]
    def _update_pkg_version(self, n, v, md5, header_index):
        """Update a package version record in the global versions dict.

        Returns True when the package was not seen before, False otherwise.
        """
        # Four nested 'if' checks get expensive when run ~100k times in a row,
        # so try/except on the happy path is faster (even faster than 'reduce')
vs = self._versions
try:
# try to load list
_list = vs[n][v][md5]
# cast it as set() and union()
_list = set(_list).union([header_index])
            # cast back to list, as set() is not JSON-serializable
vs[n][v][md5] = list(_list)
return False
except KeyError:
            # ok, this is a fresh pkg; do it the slow way
if n in vs:
# there is such pkg already
if v in vs[n]:
# there is such version, check md5
if md5 in vs[n][v]:
# just add new repo header
if header_index not in vs[n][v][md5]:
vs[n][v][md5].append(header_index)
else:
# check if such index is here...
_existing = filter(
lambda i: header_index in vs[n][v][i],
vs[n][v]
)
if _existing:
# Yuck! Same version had different MD5
logger_cli.error(
"# ERROR: Package version has multiple MD5s "
"in '{}': {}:{}:{}".format(
self._get_repo_header(
header_index
)["header"],
n,
v,
md5
)
)
vs[n][v][md5] = [header_index]
else:
                    # this is a new version for an existing package
vs[n][v] = {
md5: [header_index]
}
return False
else:
            # this is a new package
vs[n] = {
v: {
md5: [header_index]
}
}
return True
def _save_repo_descriptions(self, repo_props, desc):
# form the filename for the repo and save it
self.desctgz.add_file(
self._create_repo_header(repo_props),
json.dumps(desc)
)
# def get_description(self, repo_props, name, md5=None):
# """Gets target description
# """
# _filename = self._create_repo_header(repo_props)
# # check if it is present in cache
# if _filename in self._desc_cache:
# _descs = self._desc_cache[_filename]
# else:
# # load data
# _descs = self.desctgz.get_file(_filename)
# # Serialize it
# _descs = json.loads(_descs)
# self._desc_cache[_filename] = _descs
# # return target desc
# if name in _descs and md5 in _descs[name]:
# return _descs[name][md5]
# else:
# return None
    def parse_tag(self, tag, descriptions=False):
        """Download and parse 'Packages.gz' files for a specific tag.

        By default, descriptions are not saved due to the huge resulting
        file size and slow processing.
        """
# init gzip and downloader
_info = ReposInfo().get_repoinfo(tag)
# calculate Packages.gz files to process
_baseurl = _info.pop("baseurl")
_total_components = len(_info.keys()) - 1
_ubuntu_package_repos = 0
_other_repos = 0
for _c, _d in _info.iteritems():
for _ur, _l in _d.iteritems():
if _ur in ubuntu_releases:
_ubuntu_package_repos += len(_l)
elif _ur != 'url':
_other_repos += len(_l)
        logger_cli.info(
            "-> loaded repository info for '{}'.\n"
            " '{}', {} components, {} ubuntu repos, {} other/unknown".format(
_baseurl,
tag,
_total_components,
_ubuntu_package_repos,
_other_repos
)
)
# init progress bar
_progress = Progress(_ubuntu_package_repos)
_index = 0
_processed = 0
_new = 0
for _c, _d in _info.iteritems():
# we do not need url here, just get rid of it
if 'url' in _d:
_d.pop('url')
for _ur, _l in _d.iteritems():
# iterate package collections
for _p in _l:
# descriptions
if descriptions:
_descriptions = {}
# download and unzip
_progress.write_progress(
_index,
note="/ {} {} {} {} {}, {}/{}".format(
_c,
_ur,
_p['ubuntu-release'],
_p['type'],
_p['arch'],
_processed,
_new
)
)
_raw = get_gzipped_file(_p['filepath'])
_lines = _raw.splitlines()
_index += 1
# break lines collection into isolated pkg data
_pkg = {
"tag": tag,
"subset": _c,
"release": _ur
}
_pkg.update(_p)
_desc = {}
_key = _value = ""
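                    # 'Packages' files are RFC822-like: a blank line ends a
                    # package stanza, a leading space continues the previous
                    # field, anything else is a 'Key: Value' pair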
for _line in _lines:
if not _line:
# if the line is empty, process pkg data gathered
_name = _desc['package']
_md5 = _desc['md5sum']
_version = _desc['version']
_pkg['md5'] = _md5
# update version for a package
if self._update_pkg_version(
_name,
_version,
_md5,
self._create_repo_header(_pkg)
):
_new += 1
if descriptions:
_d_new = {
_md5: deepcopy(_desc)
}
try:
_descriptions[_name].update(_d_new)
except KeyError:
_descriptions[_name] = _d_new
# clear the data for next pkg
_processed += 1
_desc = {}
_key = ""
_value = ""
elif _line.startswith(' '):
_desc[_key] += "\n{}".format(_line)
else:
_key, _value = _line.split(': ', 1)
_key = _key.lower()
_desc[_key] = _value
# save descriptions if needed
if descriptions:
_progress.clearline()
self._save_repo_descriptions(_pkg, _descriptions)
_progress.end()
# backup headers to disk
self.versionstgz.add_file(
self._repoindexfile,
json.dumps(self._repo_index),
replace=True
)
return
    def fetch_versions(self, tag, descriptions=False):
        """Executes parsing for a specific tag
        """
if descriptions:
logger_cli.warning(
"\n\n# !!! WARNING: Saving repo descriptions "
"consumes huge amount of disk space\n\n"
)
        # parse the tag's repoinfo and update the versions index
_f = self._versionsfile
logger_cli.info("# Fetching versions for {}".format(tag))
self.parse_tag(tag, descriptions=descriptions)
logger_cli.info("-> saving updated versions to {}".format(_f))
self.versionstgz.add_file(_f, json.dumps(self._versions), replace=True)
def build_repos(self, url, tag=None):
"""Builds versions data for selected tag, or for all of them
"""
# Init the ReposInfo class and check if all files are present
_repos = ReposInfo()
        # recursively walk the mirrors
# and gather all of the repos for 'tag' or all of the tags
_repos.fetch_repos(url, tag=tag)
def action_for_tag(
self,
url,
tag,
action=None,
descriptions=None
):
"""Executes action for every tag from all collections
"""
        if not action:
            logger_cli.info("# No action set, nothing to do")
            return
# get all tags
major, updates, hotfix = ReposInfo().list_tags()
if action == "list":
logger_cli.info("# Tags available at '{}':".format(url))
for t in major:
logger_cli.info("\t{}".format(t))
for t in updates:
logger_cli.info("\t{} [updates]".format(t))
for t in hotfix:
logger_cli.info("\t{} [hotfix]".format(t))
return
        # Populate action tags
_action_tags = []
if tag in major:
_action_tags.append(tag)
elif tag in updates:
_action_tags.append(tag + ".update")
elif tag in hotfix:
_action_tags.append(tag + ".hotfix")
if not _action_tags:
            logger_cli.info(
                "# Tag '{}' not found. "
                "Consider rebuilding repos info.".format(tag)
            )
elif action == "build":
logger_cli.info(
"-> tags to build {}".format(", ".join(_action_tags))
)
for t in _action_tags:
logger_cli.info(
"# Building repo info for '{}/{}'".format(
url,
tag
)
)
self.build_repos(url, tag=tag)
elif action == "fetch":
logger_cli.info(
"-> fetching versions for tags {}".format(
", ".join(_action_tags)
)
)
for t in _action_tags:
self.fetch_versions(t, descriptions=descriptions)
logger_cli.info("# Done.")
def parse_repos(self):
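        """Fetch versions for every tag recorded in the repoinfo archive."""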
# all tags to check
major, updates, hotfix = ReposInfo().list_tags()
# major tags
logger_cli.info("# Processing major tags")
for _tag in major:
self.fetch_versions(_tag)
# updates tags
logger_cli.info("# Processing update tags")
for _tag in updates:
self.fetch_versions(_tag + ".update")
# hotfix tags
logger_cli.info("# Processing hotfix tags")
for _tag in hotfix:
self.fetch_versions(_tag + ".hotfix")
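

# A minimal usage sketch, assuming a mirror laid out as '<url>/<tag>/dists/...'
# (the URL and tag below are hypothetical):
#
#   manager = RepoManager()
#   # discover repos and cache 'Packages.gz' links for the tag
#   manager.action_for_tag("http://mirror.example.com/", "2019.2.0", action="build")
#   # parse the collected repos and update the versions archive
#   manager.action_for_tag("http://mirror.example.com/", "2019.2.0", action="fetch")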