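"""Helpers for gathering Debian-style repository info and package
versions: walks a mirror tree, collects links to Packages.gz indexes
and archives parsed package/version data for later checks.
"""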
import json
import os
from copy import deepcopy

from cfg_checker.common import logger, logger_cli
from cfg_checker.common.const import _pkg_desc_archive
from cfg_checker.common.const import _repos_index_filename
from cfg_checker.common.const import _repos_info_archive
from cfg_checker.common.const import _repos_versions_archive
from cfg_checker.common.const import _repos_versions_filename
from cfg_checker.common.const import ubuntu_releases
from cfg_checker.common.file_utils import get_gzipped_file
from cfg_checker.common.settings import pkg_dir
from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile

import requests
from requests.exceptions import ConnectionError

ext = ".json"


def _n_url(url):
    if url[-1] == '/':
        return url
    else:
        return url + '/'


class ReposInfo(object):
    repos = []
    _repofile = os.path.join(pkg_dir, "versions", _repos_info_archive)

    @staticmethod
    def _ls_repo_page(url):
        # Yes, this is ugly. But it works ok for small HTMLs.
        _a = "<a"
        _s = "href="
        _e = "\">"
        try:
            page = requests.get(url, timeout=60)
        except ConnectionError as e:
            logger_cli.error("# ERROR: {}".format(e))
            return [], []
        a = page.text.splitlines()
        # Comprehension for dirs. Dir anchor lines end with '-'
        _dirs = [l[l.index(_s)+6:l.index(_e)-1]
                 for l in a if l.startswith(_a) and l.endswith('-')]
        # Comprehension for files. File anchor lines end with the size
        _files = [l[l.index(_s)+6:l.index(_e)]
                  for l in a if l.startswith(_a) and not l.endswith('-')]

        return _dirs, _files

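    # A sketch of the autoindex lines _ls_repo_page() expects, inferred
    # from the slicing above (a real mirror's HTML may differ and would
    # need the offsets adjusted):
    #
    #   <a href="xenial/">xenial/</a>            16-May-2019 16:58    -
    #   <a href="Packages.gz">Packages.gz</a>    16-May-2019 16:58    1M
    #
    # The first yields the dir 'xenial' (trailing '/' cut by the '-1'),
    # the second the file 'Packages.gz'.
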
    def search_pkg(self, url, _list):
        # recursive method to walk the dists tree
        _dirs, _files = self._ls_repo_page(url)

        for _d in _dirs:
            # Search only in dists, ignore the rest
            if "dists" not in url and _d != "dists":
                continue
            _u = _n_url(url + _d)
            self.search_pkg(_u, _list)

        for _f in _files:
            if _f == "Packages.gz":
                _list.append(url + _f)
                logger.debug("... [F] '{}'".format(url + _f))

        return _list

    @staticmethod
    def _map_repo(_path_list, _r):
        for _pkg_path in _path_list:
            _l = _pkg_path.split('/')
            _kw = _l[_l.index('dists')+1:]
            _kw.reverse()
            _repo_item = {
                "arch": _kw[1][7:] if "binary" in _kw[1] else _kw[1],
                "type": _kw[2],
                "ubuntu-release": _kw[3],
                "filepath": _pkg_path
            }
            _r.append(_repo_item)

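    # For illustration, a path of
    #   .../dists/xenial/main/binary-amd64/Packages.gz
    # is reversed into ['Packages.gz', 'binary-amd64', 'main', 'xenial']
    # and mapped to:
    #   {"arch": "amd64", "type": "main", "ubuntu-release": "xenial",
    #    "filepath": "<the full path>"}
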
    def _find_tag(self, _t, _u, label=""):
        if label:
            _url = _n_url(_u + label)
            _label = _t + '.' + label
        else:
            _url = _u
            _label = _t
        _ts, _ = self._ls_repo_page(_url)
        if _t in _ts:
            logger.debug(
                "... found tag '{}' at '{}'".format(
                    _t,
                    _url
                )
            )
            return {
                _label: {
                    "baseurl": _n_url(_url + _t),
                    "all": {}
                }
            }
        else:
            return {}

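    # e.g. _find_tag("2019.2.0", base_url, label="update") yields
    # {"2019.2.0.update": {"baseurl": base_url + "update/2019.2.0/",
    #  "all": {}}} when the tag dir exists, or {} when it does not
    # (the tag value is illustrative).
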
    def fetch_repos(self, url, tag=None):
        base_url = _n_url(url)
        logger_cli.info("# Using '{}' as a repos source".format(base_url))

        logger_cli.info("# Gathering repos info (i.e. links to 'Packages.gz')")
        # init repoinfo archive
        _repotgz = TGZFile(self._repofile)
        # prepare repo links
        _repos = {}
        if tag:
            # only one tag to process
            _repos.update(self._find_tag(tag, base_url))
            _repos.update(self._find_tag(tag, base_url, label="hotfix"))
            _repos.update(self._find_tag(tag, base_url, label="update"))
        else:
            # gather all of them
            _tags, _ = self._ls_repo_page(base_url)
            _tags.remove('hotfix')
            _tags.remove('update')
            # search for tags in the subfolders
            _h_tags, _ = self._ls_repo_page(base_url + 'hotfix')
            _u_tags, _ = self._ls_repo_page(base_url + 'update')
            _tags.extend([t for t in _h_tags if t not in _tags])
            _tags.extend([t for t in _u_tags if t not in _tags])
            _progress = Progress(len(_tags))
            _index = 0
            for _tag in _tags:
                _repos.update(self._find_tag(_tag, base_url))
                _repos.update(self._find_tag(_tag, base_url, label="hotfix"))
                _repos.update(self._find_tag(_tag, base_url, label="update"))
                _index += 1
                _progress.write_progress(_index)
            _progress.end()

        # parse subtags
        for _label in _repos.keys():
            logger_cli.info("-> processing tag '{}'".format(_label))
            _name = _label + ".json"
            if _repotgz.has_file(_name):
                logger_cli.info(
                    "-> skipping, '{}' already has '{}'".format(
                        _repos_info_archive,
                        _name
                    )
                )
                continue
            # process the tag
            _repo = _repos[_label]
            _baseurl = _repos[_label]["baseurl"]
            # get the subtags
            _sub_tags, _ = self._ls_repo_page(_baseurl)
            _total_index = len(_sub_tags)
            _index = 0
            _progress = Progress(_total_index)
            logger.debug(
                "... found {} subtags for '{}'".format(
                    len(_sub_tags),
                    _label
                )
            )
            # save the url and start search
            for _stag in _sub_tags:
                _u = _baseurl + _stag
                _index += 1
                logger.debug(
                    "... searching repos in '{}/{}'".format(
                        _label,
                        _stag
                    )
                )

                # Searching Package collections
                if _stag in ubuntu_releases:
                    # if stag is the release, this is all packages
                    _repo["all"][_stag] = []
                    _repo["all"]["url"] = _n_url(_u)
                    _path_list = self.search_pkg(_n_url(_u), [])
                    self._map_repo(_path_list, _repo["all"][_stag])
                    logger.info(
                        "-> found {} dists".format(
                            len(_repo["all"][_stag])
                        )
                    )

                else:
                    # each subtag might have any ubuntu release
                    # so iterate them
                    _repo[_stag] = {
                        "url": _n_url(_u)
                    }
                    _releases, _ = self._ls_repo_page(_n_url(_u))
                    for _rel in _releases:
                        if _rel not in ubuntu_releases:
                            logger.debug(
                                "... skipped unknown ubuntu release: "
                                "'{}' in '{}'".format(
                                    _rel,
                                    _u
                                )
                            )
                        else:
                            _rel_u = _n_url(_u) + _rel
                            _repo[_stag][_rel] = []
                            _path_list = self.search_pkg(_n_url(_rel_u), [])
                            self._map_repo(
                                _path_list,
                                _repo[_stag][_rel]
                            )
                            logger.info(
                                "-> found {} dists for '{}'".format(
                                    len(_repo[_stag][_rel]),
                                    _rel
                                )
                            )
                _progress.write_progress(_index)

            _progress.end()
            _name = _label + ext
            _repotgz.add_file(_name, buf=json.dumps(_repo, indent=2))
            logger_cli.info(
                "-> archive '{}' updated with '{}'".format(
                    self._repofile,
                    _name
                )
            )

        return

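    # The per-tag JSON written to the archive is shaped roughly like
    # (component names illustrative):
    #   {"baseurl": "<tag url>",
    #    "all": {"url": "<url>", "xenial": [<items from _map_repo>]},
    #    "extra": {"url": "<url>", "xenial": [<items from _map_repo>]}}
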
    def list_tags(self):
        _files = TGZFile(self._repofile).list_files()
        # all files in archive with no '.json' part
        _all = set([f.rsplit('.', 1)[0] for f in _files])
        # files that end with '.update'
        _updates = set([f for f in _all if f.find('update') >= 0])
        # files that end with '.hotfix'
        _hotfix = set([f for f in _all if f.find('hotfix') >= 0])
        # remove update and hotfix tags from all. The true magic of SETs
        _all = _all - _updates - _hotfix
        # cut the update and hotfix endings
        _updates = [f.rsplit('.', 1)[0] for f in _updates]
        _hotfix = [f.rsplit('.', 1)[0] for f in _hotfix]

        return _all, _updates, _hotfix

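    # For example, an archive holding '2019.2.0.json' and
    # '2019.2.0.update.json' (names illustrative) would yield:
    #   ({'2019.2.0'}, ['2019.2.0'], [])
    # i.e. major tags as a set, update/hotfix tags as lists with the
    # suffix cut off.
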
    def get_repoinfo(self, tag):
        _tgz = TGZFile(self._repofile)
        _buf = _tgz.get_file(tag + ext)
        return json.loads(_buf)


class RepoManager(object):
    # files in archive
    _repoindexfile = _repos_index_filename
    _versionsfile = _repos_versions_filename
    # archives
    _versions_arch = os.path.join(pkg_dir, "versions", _repos_versions_archive)
    _desc_arch = os.path.join(pkg_dir, "versions", _pkg_desc_archive)

    # repository index
    _repo_index = {}

    # init package versions storage
    _versions = {}

    def __init__(self):
        # Init version files
        self.versionstgz = TGZFile(
            self._versions_arch,
            label="MCP Configuration Checker: Package versions archive"
        )
        self.desctgz = TGZFile(
            self._desc_arch,
            label="MCP Configuration Checker: Package descriptions archive"
        )

        if self._versionsfile in self.versionstgz.list_files():
            logger_cli.info(
                "# Loading versions '{}':'{}'".format(
                    self._versions_arch,
                    self._versionsfile
                )
            )
            self._versions = json.loads(
                self.versionstgz.get_file(self._versionsfile)
            )

        if self._repoindexfile in self.versionstgz.list_files():
            self._repo_index = json.loads(
                self.versionstgz.get_file(
                    self._repoindexfile
                )
            )

    def _create_repo_header(self, p):
        _header = "_".join([
            p['tag'],
            p['subset'],
            p['release'],
            p['ubuntu-release'],
            p['type'],
            p['arch']
        ])
        # check if this header is already indexed
        if not any(
            self._repo_index[i]["header"] == _header
            for i in self._repo_index
        ):
            _index = str(len(self._repo_index.keys()) + 1)
            self._repo_index[_index] = {
                "header": _header,
                "props": p
            }
        else:
            for _k, _v in self._repo_index.items():
                if _v["header"] == _header:
                    _index = _k

        return _index

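    # The header is a flat underscore-joined key, e.g.
    # "2019.2.0_extra_xenial_xenial_main_amd64" (values illustrative);
    # its index in _repo_index is a stringified ordinal ("1", "2", ...).
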
    def _get_repo_header(self, index):
        return self._repo_index[index]

    def _update_pkg_version(self, n, v, md5, header_index):
        """Updates the package version record in the global dict
        """
        # Four nested 'if's get pretty expensive when run 100k times
        # in a row, so try/except is a better way to go, even faster
        # than 'reduce'
        vs = self._versions
        try:
            # try to load the list
            _list = vs[n][v][md5]
            # cast it as set() and union()
            _list = set(_list).union([header_index])
            # cast back, as set() is not serializable
            vs[n][v][md5] = list(_list)
            return False
        except KeyError:
            # ok, this is a fresh pkg. Do it the slow way.
            if n in vs:
                # there is such a pkg already
                if v in vs[n]:
                    # there is such a version, check md5
                    if md5 in vs[n][v]:
                        # just add the new repo header
                        if header_index not in vs[n][v][md5]:
                            vs[n][v][md5].append(header_index)
                    else:
                        # check if such an index is here...
                        _existing = [
                            i for i in vs[n][v]
                            if header_index in vs[n][v][i]
                        ]
                        if _existing:
                            # Yuck! Same version had different MD5
                            logger_cli.error(
                                "# ERROR: Package version has multiple MD5s "
                                "in '{}': {}:{}:{}".format(
                                    self._get_repo_header(
                                        header_index
                                    )["header"],
                                    n,
                                    v,
                                    md5
                                )
                            )
                        vs[n][v][md5] = [header_index]
                else:
                    # this is a new version for an existing package
                    vs[n][v] = {
                        md5: [header_index]
                    }
                return False
            else:
                # this is a new package
                vs[n] = {
                    v: {
                        md5: [header_index]
                    }
                }
                return True

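    # _versions is shaped as nested dicts:
    #   {package_name: {version: {md5: [repo_header_index, ...]}}}
    # so the same md5 may be referenced by several repo headers at once.
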
    def _save_repo_descriptions(self, repo_props, desc):
        # form the filename for the repo and save it
        self.desctgz.add_file(
            self._create_repo_header(repo_props),
            json.dumps(desc)
        )

    # def get_description(self, repo_props, name, md5=None):
    #     """Gets target description
    #     """
    #     _filename = self._create_repo_header(repo_props)
    #     # check if it is present in cache
    #     if _filename in self._desc_cache:
    #         _descs = self._desc_cache[_filename]
    #     else:
    #         # load data
    #         _descs = self.desctgz.get_file(_filename)
    #         # Serialize it
    #         _descs = json.loads(_descs)
    #         self._desc_cache[_filename] = _descs
    #     # return target desc
    #     if name in _descs and md5 in _descs[name]:
    #         return _descs[name][md5]
    #     else:
    #         return None

    def parse_tag(self, tag, descriptions=False):
        """Downloads and parses the Packages.gz files for a specific tag.
        By default, descriptions are not saved
        due to the huge resulting file size and slow processing
        """
        # load the repository info gathered for this tag
        _info = ReposInfo().get_repoinfo(tag)
        # calculate Packages.gz files to process
        _baseurl = _info.pop("baseurl")
        _total_components = len(_info.keys()) - 1
        _ubuntu_package_repos = 0
        _other_repos = 0
        for _c, _d in _info.items():
            for _ur, _l in _d.items():
                if _ur in ubuntu_releases:
                    _ubuntu_package_repos += len(_l)
                elif _ur != 'url':
                    _other_repos += len(_l)
        logger_cli.info(
            "-> loaded repository info for '{}'.\n"
            "   '{}', {} components, {} ubuntu repos, {} other/unknown".format(
                _baseurl,
                tag,
                _total_components,
                _ubuntu_package_repos,
                _other_repos
            )
        )
        # init progress bar
        _progress = Progress(_ubuntu_package_repos)
        _index = 0
        _processed = 0
        _new = 0
        for _c, _d in _info.items():
            # we do not need the url here, just get rid of it
            if 'url' in _d:
                _d.pop('url')
            # _url = if 'url' in _d else _baseurl + _c
            for _ur, _l in _d.items():
                # iterate package collections
                for _p in _l:
                    # descriptions
                    if descriptions:
                        _descriptions = {}
                    # download and unzip
                    _progress.write_progress(
                        _index,
                        note="/ {} {} {} {} {}, {}/{}".format(
                            _c,
                            _ur,
                            _p['ubuntu-release'],
                            _p['type'],
                            _p['arch'],
                            _processed,
                            _new
                        )
                    )
                    _raw = get_gzipped_file(_p['filepath'])
                    _lines = _raw.splitlines()
                    _index += 1
                    # break the lines collection into isolated pkg data
                    _pkg = {
                        "tag": tag,
                        "subset": _c,
                        "release": _ur
                    }
                    _pkg.update(_p)
                    _desc = {}
                    _key = _value = ""
                    for _line in _lines:
                        if not _line:
                            # the line is empty, process the gathered pkg data
                            _name = _desc['package']
                            _md5 = _desc['md5sum']
                            _version = _desc['version']
                            _pkg['md5'] = _md5
                            # update the version record for this package
                            if self._update_pkg_version(
                                _name,
                                _version,
                                _md5,
                                self._create_repo_header(_pkg)
                            ):
                                _new += 1

                            if descriptions:
                                _d_new = {
                                    _md5: deepcopy(_desc)
                                }
                                try:
                                    _descriptions[_name].update(_d_new)
                                except KeyError:
                                    _descriptions[_name] = _d_new
                            # clear the data for the next pkg
                            _processed += 1
                            _desc = {}
                            _key = ""
                            _value = ""
                        elif _line.startswith(' '):
                            # continuation of a multi-line field
                            _desc[_key] += "\n{}".format(_line)
                        else:
                            _key, _value = _line.split(': ', 1)
                            _key = _key.lower()

                            _desc[_key] = _value
                    # save the descriptions if needed
                    if descriptions:
                        _progress.clearline()
                        self._save_repo_descriptions(_pkg, _descriptions)

        _progress.end()
        # backup the headers to disk
        self.versionstgz.add_file(
            self._repoindexfile,
            json.dumps(self._repo_index),
            replace=True
        )
        return

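    # The line parser above consumes standard Debian Packages stanzas,
    # e.g. (field values illustrative):
    #
    #   Package: openssh-server
    #   Version: 1:7.2p2-4ubuntu2.8
    #   MD5sum: <32-hex-digit checksum>
    #   Description: secure shell (SSH) server
    #    continuation lines start with a space
    #
    # A blank line ends a stanza; keys are lowercased before use.
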
    def fetch_versions(self, tag, descriptions=False):
        """Executes parsing for a specific tag
        """
        if descriptions:
            logger_cli.warning(
                "\n\n# !!! WARNING: Saving repo descriptions "
                "consumes a huge amount of disk space\n\n"
            )
        # if there is no such tag, parse it from the repoinfo
        _f = self._versionsfile
        logger_cli.info("# Fetching versions for {}".format(tag))
        self.parse_tag(tag, descriptions=descriptions)
        logger_cli.info("-> saving updated versions to {}".format(_f))
        self.versionstgz.add_file(_f, json.dumps(self._versions), replace=True)

    def build_repos(self, url, tag=None):
        """Builds versions data for the selected tag, or for all of them
        """
        # Init the ReposInfo class and check if all files are present
        _repos = ReposInfo()
        # recursively walk the mirrors
        # and gather all of the repos for 'tag' or all of the tags
        _repos.fetch_repos(url, tag=tag)

    def action_for_tag(
        self,
        url,
        tag,
        action=None,
        descriptions=None
    ):
        """Executes an action for every tag from all collections
        """
        if not action:
            logger_cli.info("# No action set, nothing to do")
            return
        # get all tags
        major, updates, hotfix = ReposInfo().list_tags()
        if action == "list":
            logger_cli.info("# Tags available at '{}':".format(url))
            for t in major:
                logger_cli.info("\t{}".format(t))
            for t in updates:
                logger_cli.info("\t{} [updates]".format(t))
            for t in hotfix:
                logger_cli.info("\t{} [hotfix]".format(t))
            return
        # Populate the action tags
        _action_tags = []
        if tag in major:
            _action_tags.append(tag)
        elif tag in updates:
            _action_tags.append(tag + ".update")
        elif tag in hotfix:
            _action_tags.append(tag + ".hotfix")

        if not _action_tags:
            logger_cli.info(
                "# Tag of '{}' not found. "
                "Consider rebuilding repos info.".format(tag)
            )
        elif action == "build":
            logger_cli.info(
                "-> tags to build {}".format(", ".join(_action_tags))
            )
            for t in _action_tags:
                logger_cli.info(
                    "# Building repo info for '{}/{}'".format(
                        url,
                        tag
                    )
                )
                self.build_repos(url, tag=tag)
        elif action == "fetch":
            logger_cli.info(
                "-> fetching versions for tags {}".format(
                    ", ".join(_action_tags)
                )
            )
            for t in _action_tags:
                self.fetch_versions(t, descriptions=descriptions)

        logger_cli.info("# Done.")

    def parse_repos(self):
        # all tags to check
        major, updates, hotfix = ReposInfo().list_tags()

        # major tags
        logger_cli.info("# Processing major tags")
        for _tag in major:
            self.fetch_versions(_tag)

        # updates tags
        logger_cli.info("# Processing update tags")
        for _tag in updates:
            self.fetch_versions(_tag + ".update")

        # hotfix tags
        logger_cli.info("# Processing hotfix tags")
        for _tag in hotfix:
            self.fetch_versions(_tag + ".hotfix")
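

# Minimal usage sketch; the mirror URL and tag below are hypothetical
# placeholders and need real values for these calls to succeed.
if __name__ == "__main__":
    manager = RepoManager()
    # walk the mirror tree and archive Packages.gz links for the tag
    manager.action_for_tag("http://mirror.example.com/", "2019.2.0",
                           action="build")
    # then download and parse the indexes, storing package versions
    manager.action_for_tag("http://mirror.example.com/", "2019.2.0",
                           action="fetch", descriptions=False)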