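"""Helpers for gathering Debian-style repository info and package
versions: walks a mirror tree, collects links to Packages.gz indexes
and archives parsed package/version data for later checks.
"""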
import json
import os
from copy import deepcopy

from cfg_checker.common import logger, logger_cli
from cfg_checker.common.const import _pkg_desc_archive
from cfg_checker.common.const import _repos_index_filename
from cfg_checker.common.const import _repos_info_archive
from cfg_checker.common.const import _repos_versions_archive
from cfg_checker.common.const import _repos_versions_filename
from cfg_checker.common.const import ubuntu_releases
from cfg_checker.common.file_utils import get_gzipped_file
from cfg_checker.common.settings import pkg_dir
from cfg_checker.helpers.console_utils import Progress
from cfg_checker.helpers.tgz import TGZFile

import requests
from requests.exceptions import ConnectionError

ext = ".json"


def _n_url(url):
    if url[-1] == '/':
        return url
    else:
        return url + '/'


class ReposInfo(object):
    repos = []
    _repofile = os.path.join(pkg_dir, "versions", _repos_info_archive)

    @staticmethod
    def _ls_repo_page(url):
        # Yes, this is ugly. But it works ok for small HTMLs.
        _a = "<a"
        _s = "href="
        _e = "\">"
        try:
            page = requests.get(url, timeout=60)
        except ConnectionError as e:
            logger_cli.error("# ERROR: {}".format(e))
            return [], []
        a = page.text.splitlines()
        # Comprehension for dirs. Dir anchor lines end with '-'
        _dirs = [l[l.index(_s)+6:l.index(_e)-1]
                 for l in a if l.startswith(_a) and l.endswith('-')]
        # Comprehension for files. File anchor lines end with the size
        _files = [l[l.index(_s)+6:l.index(_e)]
                  for l in a if l.startswith(_a) and not l.endswith('-')]

        return _dirs, _files

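    # A sketch of the autoindex lines _ls_repo_page() expects, inferred
    # from the slicing above (a real mirror's HTML may differ and would
    # need the offsets adjusted):
    #
    #   <a href="xenial/">xenial/</a>            16-May-2019 16:58    -
    #   <a href="Packages.gz">Packages.gz</a>    16-May-2019 16:58    1M
    #
    # The first yields the dir 'xenial' (trailing '/' cut by the '-1'),
    # the second the file 'Packages.gz'.
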
    def search_pkg(self, url, _list):
        # recursive method to walk the dists tree
        _dirs, _files = self._ls_repo_page(url)

        for _d in _dirs:
            # Search only in dists, ignore the rest
            if "dists" not in url and _d != "dists":
                continue
            _u = _n_url(url + _d)
            self.search_pkg(_u, _list)

        for _f in _files:
            if _f == "Packages.gz":
                _list.append(url + _f)
                logger.debug("... [F] '{}'".format(url + _f))

        return _list

    @staticmethod
    def _map_repo(_path_list, _r):
        for _pkg_path in _path_list:
            _l = _pkg_path.split('/')
            _kw = _l[_l.index('dists')+1:]
            _kw.reverse()
            _repo_item = {
                "arch": _kw[1][7:] if "binary" in _kw[1] else _kw[1],
                "type": _kw[2],
                "ubuntu-release": _kw[3],
                "filepath": _pkg_path
            }
            _r.append(_repo_item)

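    # For illustration, a path of
    #   .../dists/xenial/main/binary-amd64/Packages.gz
    # is reversed into ['Packages.gz', 'binary-amd64', 'main', 'xenial']
    # and mapped to:
    #   {"arch": "amd64", "type": "main", "ubuntu-release": "xenial",
    #    "filepath": "<the full path>"}
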
    def _find_tag(self, _t, _u, label=""):
        if label:
            _url = _n_url(_u + label)
            _label = _t + '.' + label
        else:
            _url = _u
            _label = _t
        _ts, _ = self._ls_repo_page(_url)
        if _t in _ts:
            logger.debug(
                "... found tag '{}' at '{}'".format(
                    _t,
                    _url
                )
            )
            return {
                _label: {
                    "baseurl": _n_url(_url + _t),
                    "all": {}
                }
            }
        else:
            return {}

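    # e.g. _find_tag("2019.2.0", base_url, label="update") yields
    # {"2019.2.0.update": {"baseurl": base_url + "update/2019.2.0/",
    #  "all": {}}} when the tag dir exists, or {} when it does not
    # (the tag value is illustrative).
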
    def fetch_repos(self, url, tag=None):
        base_url = _n_url(url)
        logger_cli.info("# Using '{}' as a repos source".format(base_url))

        logger_cli.info("# Gathering repos info (i.e. links to 'Packages.gz')")
        # init repoinfo archive
        _repotgz = TGZFile(self._repofile)
        # prepare repo links
        _repos = {}
        if tag:
            # only one tag to process
            _repos.update(self._find_tag(tag, base_url))
            _repos.update(self._find_tag(tag, base_url, label="hotfix"))
            _repos.update(self._find_tag(tag, base_url, label="update"))
        else:
            # gather all of them
            _tags, _ = self._ls_repo_page(base_url)
            _tags.remove('hotfix')
            _tags.remove('update')
            # search for tags in the subfolders
            _h_tags, _ = self._ls_repo_page(base_url + 'hotfix')
            _u_tags, _ = self._ls_repo_page(base_url + 'update')
            _tags.extend([t for t in _h_tags if t not in _tags])
            _tags.extend([t for t in _u_tags if t not in _tags])
            _progress = Progress(len(_tags))
            _index = 0
            for _tag in _tags:
                _repos.update(self._find_tag(_tag, base_url))
                _repos.update(self._find_tag(_tag, base_url, label="hotfix"))
                _repos.update(self._find_tag(_tag, base_url, label="update"))
                _index += 1
                _progress.write_progress(_index)
            _progress.end()

        # parse subtags
        for _label in _repos.keys():
            logger_cli.info("-> processing tag '{}'".format(_label))
            _name = _label + ".json"
            if _repotgz.has_file(_name):
                logger_cli.info(
                    "-> skipping, '{}' already has '{}'".format(
                        _repos_info_archive,
                        _name
                    )
                )
                continue
            # process the tag
            _repo = _repos[_label]
            _baseurl = _repos[_label]["baseurl"]
            # get the subtags
            _sub_tags, _ = self._ls_repo_page(_baseurl)
            _total_index = len(_sub_tags)
            _index = 0
            _progress = Progress(_total_index)
            logger.debug(
                "... found {} subtags for '{}'".format(
                    len(_sub_tags),
                    _label
                )
            )
            # save the url and start search
            for _stag in _sub_tags:
                _u = _baseurl + _stag
                _index += 1
                logger.debug(
                    "... searching repos in '{}/{}'".format(
                        _label,
                        _stag
                    )
                )

                # Searching Package collections
                if _stag in ubuntu_releases:
                    # if stag is the release, this is all packages
                    _repo["all"][_stag] = []
                    _repo["all"]["url"] = _n_url(_u)
                    _path_list = self.search_pkg(_n_url(_u), [])
                    self._map_repo(_path_list, _repo["all"][_stag])
                    logger.info(
                        "-> found {} dists".format(
                            len(_repo["all"][_stag])
                        )
                    )

                else:
                    # each subtag might have any ubuntu release
                    # so iterate them
                    _repo[_stag] = {
                        "url": _n_url(_u)
                    }
                    _releases, _ = self._ls_repo_page(_n_url(_u))
                    for _rel in _releases:
                        if _rel not in ubuntu_releases:
                            logger.debug(
                                "... skipped unknown ubuntu release: "
                                "'{}' in '{}'".format(
                                    _rel,
                                    _u
                                )
                            )
                        else:
                            _rel_u = _n_url(_u) + _rel
                            _repo[_stag][_rel] = []
                            _path_list = self.search_pkg(_n_url(_rel_u), [])
                            self._map_repo(
                                _path_list,
                                _repo[_stag][_rel]
                            )
                            logger.info(
                                "-> found {} dists for '{}'".format(
                                    len(_repo[_stag][_rel]),
                                    _rel
                                )
                            )
                _progress.write_progress(_index)

            _progress.end()
            _name = _label + ext
            _repotgz.add_file(_name, buf=json.dumps(_repo, indent=2))
            logger_cli.info(
                "-> archive '{}' updated with '{}'".format(
                    self._repofile,
                    _name
                )
            )

        return

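    # The per-tag JSON written to the archive is shaped roughly like
    # (component names illustrative):
    #   {"baseurl": "<tag url>",
    #    "all": {"url": "<url>", "xenial": [<items from _map_repo>]},
    #    "extra": {"url": "<url>", "xenial": [<items from _map_repo>]}}
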
    def list_tags(self):
        _files = TGZFile(self._repofile).list_files()
        # all files in archive with no '.json' part
        _all = set([f.rsplit('.', 1)[0] for f in _files])
        # files that end with '.update'
        _updates = set([f for f in _all if f.find('update') >= 0])
        # files that end with '.hotfix'
        _hotfix = set([f for f in _all if f.find('hotfix') >= 0])
        # remove update and hotfix tags from all. The true magic of SETs
        _all = _all - _updates - _hotfix
        # cut the update and hotfix endings
        _updates = [f.rsplit('.', 1)[0] for f in _updates]
        _hotfix = [f.rsplit('.', 1)[0] for f in _hotfix]

        return _all, _updates, _hotfix

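    # For example, an archive holding '2019.2.0.json' and
    # '2019.2.0.update.json' (names illustrative) would yield:
    #   ({'2019.2.0'}, ['2019.2.0'], [])
    # i.e. major tags as a set, update/hotfix tags as lists with the
    # suffix cut off.
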
    def get_repoinfo(self, tag):
        _tgz = TGZFile(self._repofile)
        _buf = _tgz.get_file(tag + ext)
        return json.loads(_buf)


class RepoManager(object):
    # files in archive
    _repoindexfile = _repos_index_filename
    _versionsfile = _repos_versions_filename
    # archives
    _versions_arch = os.path.join(pkg_dir, "versions", _repos_versions_archive)
    _desc_arch = os.path.join(pkg_dir, "versions", _pkg_desc_archive)

    # repository index
    _repo_index = {}

    # init package versions storage
    _versions = {}

    def __init__(self):
        # Init version files
        self.versionstgz = TGZFile(
            self._versions_arch,
            label="MCP Configuration Checker: Package versions archive"
        )
        self.desctgz = TGZFile(
            self._desc_arch,
            label="MCP Configuration Checker: Package descriptions archive"
        )

        if self._versionsfile in self.versionstgz.list_files():
            logger_cli.info(
                "# Loading versions '{}':'{}'".format(
                    self._versions_arch,
                    self._versionsfile
                )
            )
            self._versions = json.loads(
                self.versionstgz.get_file(self._versionsfile)
            )

        if self._repoindexfile in self.versionstgz.list_files():
            self._repo_index = json.loads(
                self.versionstgz.get_file(
                    self._repoindexfile
                )
            )

    def _create_repo_header(self, p):
        _header = "_".join([
            p['tag'],
            p['subset'],
            p['release'],
            p['ubuntu-release'],
            p['type'],
            p['arch']
        ])
        # check if this header is already indexed
        if not any(
            self._repo_index[i]["header"] == _header
            for i in self._repo_index
        ):
            _index = str(len(self._repo_index.keys()) + 1)
            self._repo_index[_index] = {
                "header": _header,
                "props": p
            }
        else:
            for _k, _v in self._repo_index.items():
                if _v["header"] == _header:
                    _index = _k

        return _index

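    # The header is a flat underscore-joined key, e.g.
    # "2019.2.0_extra_xenial_xenial_main_amd64" (values illustrative);
    # its index in _repo_index is a stringified ordinal ("1", "2", ...).
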
    def _get_repo_header(self, index):
        return self._repo_index[index]

    def _update_pkg_version(self, n, v, md5, header_index):
        """Updates the package version record in the global dict
        """
        # Four nested 'if's get pretty expensive when run 100k times
        # in a row, so try/except is a better way to go, even faster
        # than 'reduce'
        vs = self._versions
        try:
            # try to load the list
            _list = vs[n][v][md5]
            # cast it as set() and union()
            _list = set(_list).union([header_index])
            # cast back, as set() is not serializable
            vs[n][v][md5] = list(_list)
            return False
        except KeyError:
            # ok, this is a fresh pkg. Do it the slow way.
            if n in vs:
                # there is such a pkg already
                if v in vs[n]:
                    # there is such a version, check md5
                    if md5 in vs[n][v]:
                        # just add the new repo header
                        if header_index not in vs[n][v][md5]:
                            vs[n][v][md5].append(header_index)
                    else:
                        # check if such an index is here...
                        _existing = [
                            i for i in vs[n][v]
                            if header_index in vs[n][v][i]
                        ]
                        if _existing:
                            # Yuck! Same version had different MD5
                            logger_cli.error(
                                "# ERROR: Package version has multiple MD5s "
                                "in '{}': {}:{}:{}".format(
                                    self._get_repo_header(
                                        header_index
                                    )["header"],
                                    n,
                                    v,
                                    md5
                                )
                            )
                        vs[n][v][md5] = [header_index]
                else:
                    # this is a new version for an existing package
                    vs[n][v] = {
                        md5: [header_index]
                    }
                return False
            else:
                # this is a new package
                vs[n] = {
                    v: {
                        md5: [header_index]
                    }
                }
                return True

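    # _versions is shaped as nested dicts:
    #   {package_name: {version: {md5: [repo_header_index, ...]}}}
    # so the same md5 may be referenced by several repo headers at once.
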
    def _save_repo_descriptions(self, repo_props, desc):
        # form the filename for the repo and save it
        self.desctgz.add_file(
            self._create_repo_header(repo_props),
            json.dumps(desc)
        )

    # def get_description(self, repo_props, name, md5=None):
    #     """Gets target description
    #     """
    #     _filename = self._create_repo_header(repo_props)
    #     # check if it is present in cache
    #     if _filename in self._desc_cache:
    #         _descs = self._desc_cache[_filename]
    #     else:
    #         # load data
    #         _descs = self.desctgz.get_file(_filename)
    #         # Serialize it
    #         _descs = json.loads(_descs)
    #         self._desc_cache[_filename] = _descs
    #     # return target desc
    #     if name in _descs and md5 in _descs[name]:
    #         return _descs[name][md5]
    #     else:
    #         return None

    def parse_tag(self, tag, descriptions=False):
        """Downloads and parses the Packages.gz files for a specific tag.
        By default, descriptions are not saved
        due to the huge resulting file size and slow processing
        """
        # load the repository info gathered for this tag
        _info = ReposInfo().get_repoinfo(tag)
        # calculate Packages.gz files to process
        _baseurl = _info.pop("baseurl")
        _total_components = len(_info.keys()) - 1
        _ubuntu_package_repos = 0
        _other_repos = 0
        for _c, _d in _info.items():
            for _ur, _l in _d.items():
                if _ur in ubuntu_releases:
                    _ubuntu_package_repos += len(_l)
                elif _ur != 'url':
                    _other_repos += len(_l)
        logger_cli.info(
            "-> loaded repository info for '{}'.\n"
            "   '{}', {} components, {} ubuntu repos, {} other/unknown".format(
                _baseurl,
                tag,
                _total_components,
                _ubuntu_package_repos,
                _other_repos
            )
        )
        # init progress bar
        _progress = Progress(_ubuntu_package_repos)
        _index = 0
        _processed = 0
        _new = 0
        for _c, _d in _info.items():
            # we do not need the url here, just get rid of it
            if 'url' in _d:
                _d.pop('url')
            # _url = if 'url' in _d else _baseurl + _c
            for _ur, _l in _d.items():
                # iterate package collections
                for _p in _l:
                    # descriptions
                    if descriptions:
                        _descriptions = {}
                    # download and unzip
                    _progress.write_progress(
                        _index,
                        note="/ {} {} {} {} {}, {}/{}".format(
                            _c,
                            _ur,
                            _p['ubuntu-release'],
                            _p['type'],
                            _p['arch'],
                            _processed,
                            _new
                        )
                    )
                    _raw = get_gzipped_file(_p['filepath'])
                    _lines = _raw.splitlines()
                    _index += 1
                    # break the lines collection into isolated pkg data
                    _pkg = {
                        "tag": tag,
                        "subset": _c,
                        "release": _ur
                    }
                    _pkg.update(_p)
                    _desc = {}
                    _key = _value = ""
                    for _line in _lines:
                        if not _line:
                            # the line is empty, process the gathered pkg data
                            _name = _desc['package']
                            _md5 = _desc['md5sum']
                            _version = _desc['version']
                            _pkg['md5'] = _md5
                            # update the version record for this package
                            if self._update_pkg_version(
                                _name,
                                _version,
                                _md5,
                                self._create_repo_header(_pkg)
                            ):
                                _new += 1

                            if descriptions:
                                _d_new = {
                                    _md5: deepcopy(_desc)
                                }
                                try:
                                    _descriptions[_name].update(_d_new)
                                except KeyError:
                                    _descriptions[_name] = _d_new
                            # clear the data for the next pkg
                            _processed += 1
                            _desc = {}
                            _key = ""
                            _value = ""
                        elif _line.startswith(' '):
                            # continuation of a multi-line field
                            _desc[_key] += "\n{}".format(_line)
                        else:
                            _key, _value = _line.split(': ', 1)
                            _key = _key.lower()

                            _desc[_key] = _value
                    # save the descriptions if needed
                    if descriptions:
                        _progress.clearline()
                        self._save_repo_descriptions(_pkg, _descriptions)

        _progress.end()
        # backup the headers to disk
        self.versionstgz.add_file(
            self._repoindexfile,
            json.dumps(self._repo_index),
            replace=True
        )
        return

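    # The line parser above consumes standard Debian Packages stanzas,
    # e.g. (field values illustrative):
    #
    #   Package: openssh-server
    #   Version: 1:7.2p2-4ubuntu2.8
    #   MD5sum: <32-hex-digit checksum>
    #   Description: secure shell (SSH) server
    #    continuation lines start with a space
    #
    # A blank line ends a stanza; keys are lowercased before use.
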
    def fetch_versions(self, tag, descriptions=False):
        """Executes parsing for a specific tag
        """
        if descriptions:
            logger_cli.warning(
                "\n\n# !!! WARNING: Saving repo descriptions "
                "consumes a huge amount of disk space\n\n"
            )
        # if there is no such tag, parse it from the repoinfo
        _f = self._versionsfile
        logger_cli.info("# Fetching versions for {}".format(tag))
        self.parse_tag(tag, descriptions=descriptions)
        logger_cli.info("-> saving updated versions to {}".format(_f))
        self.versionstgz.add_file(_f, json.dumps(self._versions), replace=True)

    def build_repos(self, url, tag=None):
        """Builds versions data for the selected tag, or for all of them
        """
        # Init the ReposInfo class and check if all files are present
        _repos = ReposInfo()
        # recursively walk the mirrors
        # and gather all of the repos for 'tag' or all of the tags
        _repos.fetch_repos(url, tag=tag)

    def action_for_tag(
        self,
        url,
        tag,
        action=None,
        descriptions=None
    ):
        """Executes an action for every tag from all collections
        """
        if not action:
            logger_cli.info("# No action set, nothing to do")
            return
        # get all tags
        major, updates, hotfix = ReposInfo().list_tags()
        if action == "list":
            logger_cli.info("# Tags available at '{}':".format(url))
            for t in major:
                logger_cli.info("\t{}".format(t))
            for t in updates:
                logger_cli.info("\t{} [updates]".format(t))
            for t in hotfix:
                logger_cli.info("\t{} [hotfix]".format(t))
            return
        # Populate the action tags
        _action_tags = []
        if tag in major:
            _action_tags.append(tag)
        elif tag in updates:
            _action_tags.append(tag + ".update")
        elif tag in hotfix:
            _action_tags.append(tag + ".hotfix")

        if not _action_tags:
            logger_cli.info(
                "# Tag of '{}' not found. "
                "Consider rebuilding repos info.".format(tag)
            )
        elif action == "build":
            logger_cli.info(
                "-> tags to build {}".format(", ".join(_action_tags))
            )
            for t in _action_tags:
                logger_cli.info(
                    "# Building repo info for '{}/{}'".format(
                        url,
                        tag
                    )
                )
                self.build_repos(url, tag=tag)
        elif action == "fetch":
            logger_cli.info(
                "-> fetching versions for tags {}".format(
                    ", ".join(_action_tags)
                )
            )
            for t in _action_tags:
                self.fetch_versions(t, descriptions=descriptions)

        logger_cli.info("# Done.")

    def parse_repos(self):
        # all tags to check
        major, updates, hotfix = ReposInfo().list_tags()

        # major tags
        logger_cli.info("# Processing major tags")
        for _tag in major:
            self.fetch_versions(_tag)

        # updates tags
        logger_cli.info("# Processing update tags")
        for _tag in updates:
            self.fetch_versions(_tag + ".update")

        # hotfix tags
        logger_cli.info("# Processing hotfix tags")
        for _tag in hotfix:
            self.fetch_versions(_tag + ".hotfix")
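

# Minimal usage sketch; the mirror URL and tag below are hypothetical
# placeholders and need real values for these calls to succeed.
if __name__ == "__main__":
    manager = RepoManager()
    # walk the mirror tree and archive Packages.gz links for the tag
    manager.action_for_tag("http://mirror.example.com/", "2019.2.0",
                           action="build")
    # then download and parse the indexes, storing package versions
    manager.action_for_tag("http://mirror.example.com/", "2019.2.0",
                           action="fetch", descriptions=False)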