Support building multiple pypi mirrors.

Use atomic renames when creating the mirror to avoid races.

Change-Id: Id3a75832d2a71a71253470ffae7499a3946e8ee6
Reviewed-on: https://review.openstack.org/23771
Reviewed-by: Jeremy Stanley <fungi@yuggoth.org>
Approved: James E. Blair <corvus@inaugust.com>
Reviewed-by: James E. Blair <corvus@inaugust.com>
Tested-by: Jenkins
diff --git a/jeepyb/cmd/process_cache.py b/jeepyb/cmd/process_cache.py
deleted file mode 100644
index 67e8691..0000000
--- a/jeepyb/cmd/process_cache.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# vim: tabstop=4 shiftwidth=4 softtabstop=4
-
-# Copyright 2012  Hewlett-Packard Development Company, L.P.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import os
-import urllib
-import datetime
-import sys
-import re
-import md5
-
-
-def main():
-    source_cache = sys.argv[1]
-    destination_mirror = sys.argv[2]
-
-    PACKAGE_VERSION_RE = re.compile(r'(.*)-[0-9]')
-    full_html_line = "<a href='{dir}/{name}'>{name}</a><br />\n"
-
-    packages = {}
-    package_count = 0
-
-    for filename in os.listdir(source_cache):
-        if filename.endswith('content-type'):
-            continue
-
-        realname = urllib.unquote(filename)
-        # The ? accounts for sourceforge downloads
-        tarball = os.path.basename(realname).split("?")[0]
-        name_match = PACKAGE_VERSION_RE.search(tarball)
-
-        if name_match is None:
-            continue
-        package_name = name_match.group(1)
-
-        version_list = packages.get(package_name, {})
-        version_list[tarball] = filename
-        packages[package_name] = version_list
-        package_count = package_count + 1
-
-    full_html = open(os.path.join(destination_mirror, "full.html"), 'w')
-    simple_html = open(os.path.join(destination_mirror, "index.html"), 'w')
-
-    header = ("<html><head><title>PyPI Mirror</title></head>"
-              "<body><h1>PyPI Mirror</h1><h2>Last update: %s</h2>\n\n"
-              % datetime.datetime.utcnow().strftime("%c UTC"))
-    full_html.write(header)
-    simple_html.write(header)
-
-    for package_name, versions in packages.items():
-        destination_dir = os.path.join(destination_mirror, package_name)
-        if not os.path.isdir(destination_dir):
-            os.makedirs(destination_dir)
-        safe_dir = urllib.quote(package_name)
-        simple_html.write("<a href='%s'>%s</a><br />\n" % (safe_dir, safe_dir))
-        with open(os.path.join(destination_dir, "index.html"), 'w') as index:
-            index.write("""<html><head>
-      <title>%s &ndash; PyPI Mirror</title>
-    </head><body>\n""" % package_name)
-            for tarball, filename in versions.items():
-                source_path = os.path.join(source_cache, filename)
-                destination_path = os.path.join(destination_dir, tarball)
-                with open(destination_path, 'w') as dest:
-                    src = open(source_path, 'r').read()
-                    md5sum = md5.md5(src).hexdigest()
-                    dest.write(src)
-
-                    safe_name = urllib.quote(tarball)
-
-                    full_html.write(full_html_line.format(dir=safe_dir,
-                                                          name=safe_name))
-                    index.write("<a href='%s#md5=%s'>%s</a>\n" % (safe_name,
-                                                                  md5sum,
-                                                                  safe_name))
-            index.write("</body></html>\n")
-    footer = """<p class='footer'>Generated by process_cache.py; %d
-    packages mirrored. </p>
-    </body></html>\n""" % package_count
-    full_html.write(footer)
-    full_html.close()
-    simple_html.write(footer)
-    simple_html.close()
diff --git a/jeepyb/cmd/run_mirror.py b/jeepyb/cmd/run_mirror.py
index 6ae2d65..e96573f 100644
--- a/jeepyb/cmd/run_mirror.py
+++ b/jeepyb/cmd/run_mirror.py
@@ -1,6 +1,7 @@
 #! /usr/bin/env python
 # Copyright (C) 2011 OpenStack, LLC.
-# Copyright (c) 2013 Hewlett-Packard Development Company, L.P.
+# Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
+# Copyright (C) 2013 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,14 +15,23 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-# run_mirrors reads a project config file called projects.yaml
-# It should look like:
+# run_mirror reads a YAML config file like:
+#   cache-root: /tmp/cache
 #
-# - project: PROJECT_NAME
+#   mirrors:
+#     - name: openstack
+#       projects:
+#         - https://github.com/openstack/requirements
+#       output: /tmp/mirror/openstack
+#
+#     - name: openstack-infra
+#       projects:
+#         - https://github.com/openstack-infra/config
+#       output: /tmp/mirror/openstack-infra
 #
 # The algorithm it attempts to follow is:
 #
-# for each project in projects.yaml:
+# for each project:
 #   clone if necessary and fetch origin
 #   for each project-branch:
 #     create new virtualenv
@@ -31,11 +41,8 @@
 #       create new virtualenv
 #       pip install (download only) full-reqs into virtualenv
 #
-# By default only summary information is printed on stdout, but if
-# DEFAULT is enabled in the calling environment then stdout of all
-# shell commands run is also printed. Due to its copiousness and
-# buffering, however, DEBUG level output is best suited to file
-# redirection.
+# By default only summary information is printed on stdout (see the
+# --verbose command line option to get more debug info).
 #
 # If "pip install" for a branch's requirements fails to complete
 # (based on parsing of its output), that output will be copied to
@@ -51,115 +58,276 @@
 import sys
 import tempfile
 import yaml
+import argparse
+import re
+import urllib
+import datetime
+import md5
 
 
-def run_command(cmd):
-    cmd_list = shlex.split(str(cmd))
-    p = subprocess.Popen(cmd_list, stdout=subprocess.PIPE,
-                         stderr=subprocess.STDOUT)
-    (out, nothing) = p.communicate()
-    return out.strip()
+class Mirror(object):
+    def __init__(self):
+        parser = argparse.ArgumentParser(
+            description='Build a pypi mirror from requirements')
+        parser.add_argument('-c', dest='config',
+                            help='specify the config file')
+        parser.add_argument('-n', dest='noop', action='store_true',
+                            help='do not run any commands')
+        parser.add_argument('--no-pip', dest='no_pip', action='store_true',
+                            help='do not run any pip commands')
+        parser.add_argument('--verbose', dest='debug', action='store_true',
+                            help='output verbose debug information')
+        parser.add_argument('--no-download', dest='no_download',
+                            action='store_true',
+                            help='only process the pip cache into a mirror '
+                            '(do not download)')
+        parser.add_argument('--no-process', dest='no_process',
+                            action='store_true',
+                            help='only download into the pip cache '
+                            '(do not process the cache into a mirror)')
+        parser.add_argument('--no-update', dest='no_update',
+                            action='store_true',
+                            help='do not update any git repos')
+        self.args = parser.parse_args()
+        self.config = yaml.load(open(self.args.config))
 
+    def run_command(self, cmd):
+        cmd_list = shlex.split(str(cmd))
+        self.debug("Run: %s" % cmd)
+        if self.args.noop:
+            return ''
+        if self.args.no_pip and cmd_list[0].endswith('pip'):
+            return ''
+        p = subprocess.Popen(cmd_list, stdout=subprocess.PIPE,
+                             stderr=subprocess.STDOUT)
+        (out, nothing) = p.communicate()
+        out = out.strip()
+        self.debug(out)
+        return out
 
-def main():
+    def run(self):
+        for mirror in self.config['mirrors']:
+            if not self.args.no_download:
+                self.build_mirror(mirror)
+            if not self.args.no_process:
+                self.process_cache(mirror)
 
-    DEBUG = True if os.environ.get('DEBUG', '').lower() in ('enabled',
-                                                            'enable',
-                                                            'true',
-                                                            'yes',
-                                                            'on',
-                                                            '1') else False
-    PROJECTS_YAML = os.environ.get('PROJECTS_YAML',
-                                   '/etc/openstackci/projects.yaml')
-    PIP_TEMP_DOWNLOAD = os.environ.get('PIP_TEMP_DOWNLOAD',
-                                       '/var/lib/pip-download')
-    PIP_DOWNLOAD_CACHE = os.environ.get('PIP_DOWNLOAD_CACHE',
-                                        '/var/cache/pip')
-    GIT_SOURCE = os.environ.get('GIT_SOURCE', 'https://github.com')
-    pip_format = "%s install -U %s --exists-action=w -r %s"
-    venv_format = ("/usr/local/bin/virtualenv --clear --distribute "
-                   "--extra-search-dir=%s %s")
+    def chdir(self, dest):
+        self.debug("cd %s" % dest)
+        if not self.args.noop:
+            os.chdir(dest)
 
-    (defaults, config) = [config for config in
-                          yaml.load_all(open(PROJECTS_YAML))]
+    def debug(self, msg):
+        if self.args.debug:
+            print msg
 
-    workdir = tempfile.mkdtemp()
-    reqs = os.path.join(workdir, "reqs")
-    venv = os.path.join(workdir, "venv")
-    pip = os.path.join(venv, "bin", "pip")
+    def process_http_requirements(self, reqlist, pip_cache_dir, pip):
+        new_reqs = []
+        for reqfile in reqlist:
+            for req in open(reqfile):
+                req = req.strip()
+                # Handle http://, https://, and git+https?://
+                if not re.search('https?://', req):
+                    new_reqs.append(req)
+                    continue
+                target_url = req.split('#', 1)[0]
+                target_file = os.path.join(pip_cache_dir,
+                                           urllib.quote(target_url, ''))
+                if os.path.exists(target_file):
+                    self.debug("Unlink: %s" % target_file)
+                    os.unlink(target_file)
+                if os.path.exists(target_file + '.content-type'):
+                    self.debug("Unlink: %s.content-type" % target_file)
+                    os.unlink(target_file + '.content-type')
+        return new_reqs
 
-    for section in config:
-        project = section['project']
-        if DEBUG:
-            print("*********************\nupdating %s repository" % project)
+    def build_mirror(self, mirror):
+        print("Building mirror: %s" % mirror['name'])
+        pip_format = ("%s install -M -U %s --exists-action=w "
+                      "--download-cache=%s -r %s")
+        venv_format = ("virtualenv --clear --distribute "
+                       "--extra-search-dir=%s %s")
 
-        os.chdir(PIP_TEMP_DOWNLOAD)
-        short_project = project.split('/')[1]
-        if not os.path.isdir(short_project):
-            out = run_command("git clone %s/%s.git %s" % (GIT_SOURCE, project,
-                                                          short_project))
-            if DEBUG:
-                print(out)
-        os.chdir(short_project)
-        out = run_command("git fetch -p origin")
-        if DEBUG:
-            print(out)
+        workdir = tempfile.mkdtemp()
+        reqs = os.path.join(workdir, "reqs")
+        venv = os.path.join(workdir, "venv")
+        pip = os.path.join(venv, "bin", "pip")
 
-        for branch in run_command("git branch -a").split("\n"):
-            branch = branch.strip()
-            if (not branch.startswith("remotes/origin")
-                    or "origin/HEAD" in branch):
-                continue
-            print("*********************")
-            print("Fetching pip requires for %s:%s" % (project, branch))
-            out = run_command("git reset --hard %s" % branch)
-            if DEBUG:
-                print(out)
-            out = run_command("git clean -x -f -d -q")
-            if DEBUG:
-                print(out)
-            reqlist = []
-            for requires_file in ("requirements.txt",
-                                  "test-requirements.txt",
-                                  "tools/pip-requires",
-                                  "tools/test-requires"):
-                if os.path.exists(requires_file):
-                    reqlist.append(requires_file)
-            if reqlist:
-                out = run_command(venv_format % (PIP_DOWNLOAD_CACHE, venv))
-                if DEBUG:
-                    print(out)
-                out = run_command(pip_format % (pip, "",
-                                                " -r ".join(reqlist)))
-                if DEBUG:
-                    print(out)
-                if "\nSuccessfully installed " not in out:
-                    sys.stderr.write("Installing pip requires for %s:%s "
-                                     "failed.\n%s\n" %
-                                     (project, branch, out))
-                    print("pip install did not indicate success")
-                else:
-                    freeze = run_command("%s freeze -l" % pip)
-                    reqfd = open(reqs, "w")
-                    for line in freeze.split("\n"):
-                        if line.startswith("-e ") or (
-                                "==" in line and " " not in line):
-                            reqfd.write(line + "\n")
-                    reqfd.close()
-                    out = run_command(venv_format % (PIP_DOWNLOAD_CACHE, venv))
-                    if DEBUG:
-                        print(out)
-                    out = run_command(pip_format % (pip, "--no-install",
-                                      reqs))
-                    if DEBUG:
-                        print(out)
-                    if "\nSuccessfully downloaded " not in out:
-                        sys.stderr.write("Downloading pip requires for %s:%s "
+        project_cache_dir = os.path.join(self.config['cache-root'],
+                                         'projects')
+        pip_cache_dir = os.path.join(self.config['cache-root'],
+                                     'pip', mirror['name'])
+        if not self.args.noop:
+            if not os.path.exists(project_cache_dir):
+                os.makedirs(project_cache_dir)
+            if not os.path.exists(pip_cache_dir):
+                os.makedirs(pip_cache_dir)
+
+        for project in mirror['projects']:
+            print("Updating repository: %s" % project)
+            self.chdir(project_cache_dir)
+            short_project = project.split('/')[-1]
+            if short_project.endswith('.git'):
+                short_project = short_project[:-4]
+            if not os.path.isdir(short_project):
+                out = self.run_command("git clone %s %s" %
+                                       (project, short_project))
+            self.chdir(os.path.join(project_cache_dir,
+                                    short_project))
+            out = self.run_command("git fetch -p origin")
+
+            for branch in self.run_command("git branch -a").split("\n"):
+                branch = branch.strip()
+                if (not branch.startswith("remotes/origin")
+                        or "origin/HEAD" in branch):
+                    continue
+                print("Fetching pip requires for %s:%s" %
+                      (project, branch))
+                if not self.args.no_update:
+                    out = self.run_command("git reset --hard %s" % branch)
+                    out = self.run_command("git clean -x -f -d -q")
+                reqlist = []
+                for requires_file in ("requirements.txt",
+                                      "test-requirements.txt",
+                                      "tools/pip-requires",
+                                      "tools/test-requires"):
+                    if os.path.exists(requires_file):
+                        reqlist.append(requires_file)
+                if reqlist:
+                    out = self.run_command(venv_format %
+                                           (pip_cache_dir, venv))
+                    new_reqs = self.process_http_requirements(reqlist,
+                                                              pip_cache_dir,
+                                                              pip)
+                    (reqfp, reqfn) = tempfile.mkstemp()
+                    os.write(reqfp, '\n'.join(new_reqs))
+                    os.close(reqfp)
+                    out = self.run_command(pip_format %
+                                           (pip, "", pip_cache_dir,
+                                            reqfn))
+                    if "\nSuccessfully installed " not in out:
+                        sys.stderr.write("Installing pip requires for %s:%s "
                                          "failed.\n%s\n" %
                                          (project, branch, out))
                         print("pip install did not indicate success")
-                    print("cached:\n%s" % freeze)
-            else:
-                print("no requirements")
+                    else:
+                        freeze = self.run_command("%s freeze -l" % pip)
+                        reqfd = open(reqs, "w")
+                        for line in freeze.split("\n"):
+                            if line.startswith("-e ") or (
+                                    "==" in line and " " not in line):
+                                reqfd.write(line + "\n")
+                        reqfd.close()
+                        out = self.run_command(venv_format %
+                                               (pip_cache_dir, venv))
+                        out = self.run_command(pip_format %
+                                               (pip, "--no-install",
+                                                pip_cache_dir, reqs))
+                        if "\nSuccessfully downloaded " not in out:
+                            sys.stderr.write("Downloading pip requires for "
+                                             "%s:%s failed.\n%s\n" %
+                                             (project, branch, out))
+                            print("pip install did not indicate success")
+                        print("cached:\n%s" % freeze)
+                else:
+                    print("no requirements")
+        shutil.rmtree(workdir)
 
-    shutil.rmtree(workdir)
+    def process_cache(self, mirror):
+        if self.args.noop:
+            return
+
+        pip_cache_dir = os.path.join(self.config['cache-root'],
+                                     'pip', mirror['name'])
+        destination_mirror = mirror['output']
+
+        PACKAGE_VERSION_RE = re.compile(r'(.*)-[0-9]')
+        full_html_line = "<a href='{dir}/{name}'>{name}</a><br />\n"
+
+        packages = {}
+        package_count = 0
+
+        if not os.path.exists(destination_mirror):
+            os.makedirs(destination_mirror)
+
+        for filename in os.listdir(pip_cache_dir):
+            if filename.endswith('content-type'):
+                continue
+
+            realname = urllib.unquote(filename)
+            # The ? accounts for sourceforge downloads
+            tarball = os.path.basename(realname).split("?")[0]
+            name_match = PACKAGE_VERSION_RE.search(tarball)
+
+            if name_match is None:
+                continue
+            package_name = name_match.group(1)
+
+            version_list = packages.get(package_name, {})
+            version_list[tarball] = filename
+            packages[package_name] = version_list
+            package_count = package_count + 1
+
+        full_html = open(os.path.join(destination_mirror, ".full.html"), 'w')
+        simple_html = open(os.path.join(destination_mirror, ".index.html"),
+                           'w')
+
+        header = ("<html><head><title>PyPI Mirror</title></head>"
+                  "<body><h1>PyPI Mirror</h1><h2>Last update: %s</h2>\n\n"
+                  % datetime.datetime.utcnow().strftime("%c UTC"))
+        full_html.write(header)
+        simple_html.write(header)
+
+        for package_name, versions in packages.items():
+            destination_dir = os.path.join(destination_mirror, package_name)
+            if not os.path.isdir(destination_dir):
+                os.makedirs(destination_dir)
+            safe_dir = urllib.quote(package_name)
+            simple_html.write("<a href='%s'>%s</a><br />\n" %
+                              (safe_dir, safe_dir))
+            with open(os.path.join(destination_dir, ".index.html"),
+                      'w') as index:
+                index.write("""<html><head>
+          <title>%s &ndash; PyPI Mirror</title>
+        </head><body>\n""" % package_name)
+                for tarball, filename in versions.items():
+                    source_path = os.path.join(pip_cache_dir, filename)
+                    destination_path = os.path.join(destination_dir,
+                                                    tarball)
+                    dot_destination_path = os.path.join(destination_dir,
+                                                        '.' + tarball)
+                    with open(dot_destination_path, 'w') as dest:
+                        src = open(source_path, 'r').read()
+                        md5sum = md5.md5(src).hexdigest()
+                        dest.write(src)
+
+                        safe_name = urllib.quote(tarball)
+
+                        full_html.write(full_html_line.format(dir=safe_dir,
+                                                              name=safe_name))
+                        index.write("<a href='%s#md5=%s'>%s</a>\n" %
+                                    (safe_name, md5sum, safe_name))
+                    os.rename(dot_destination_path, destination_path)
+                index.write("</body></html>\n")
+            os.rename(os.path.join(destination_dir, ".index.html"),
+                      os.path.join(destination_dir, "index.html"))
+        footer = """<p class='footer'>Generated by process_cache.py; %d
+        packages mirrored. </p>
+        </body></html>\n""" % package_count
+        full_html.write(footer)
+        full_html.close()
+        os.rename(os.path.join(destination_mirror, ".full.html"),
+                  os.path.join(destination_mirror, "full.html"))
+        simple_html.write(footer)
+        simple_html.close()
+        os.rename(os.path.join(destination_mirror, ".index.html"),
+                  os.path.join(destination_mirror, "index.html"))
+
+
+def main():
+    mb = Mirror()
+    mb.run()
+
+
+if __name__ == "__main__":
+    main()