Merge pull request #8 from akutz/feature/persisted-metadata

Support persisted metadata
diff --git a/DataSourceVMwareGuestInfo.py b/DataSourceVMwareGuestInfo.py
index b4d2fc0..3a3b884 100644
--- a/DataSourceVMwareGuestInfo.py
+++ b/DataSourceVMwareGuestInfo.py
@@ -1,5 +1,3 @@
-# vi: ts=4 expandtab
-#
 # Cloud-Init Datasource for VMware Guestinfo
 #
 # Copyright (c) 2018 VMware, Inc. All Rights Reserved.
@@ -15,95 +13,150 @@
 # Authors: Anish Swaminathan <anishs@vmware.com>
 #          Andrew Kutz <akutz@vmware.com>
 #
-import os
+
+'''
+A cloud init datasource for VMware GuestInfo.
+'''
+
+import collections
 import base64
 import zlib
 import json
+from distutils.spawn import find_executable
 
 from cloudinit import log as logging
 from cloudinit import sources
 from cloudinit import util
 from cloudinit import safeyaml
 
-from distutils.spawn import find_executable
-
 LOG = logging.getLogger(__name__)
+NOVAL = "No value found"
+VMTOOLSD = find_executable("vmtoolsd")
 
-# This cloud-init datasource was designed for use with CentOS 7,
-# which uses cloud-init 0.7.9. However, this datasource should
-# work with any Linux distribution for which cloud-init is
-# avaialble.
-#
-# The documentation for cloud-init 0.7.9's datasource is
-# available at http://bit.ly/cloudinit-datasource-0-7-9. The
-# current documentation for cloud-init is found at
-# https://cloudinit.readthedocs.io/en/latest/.
-#
-# Setting the hostname:
-#     The hostname is set by way of the metadata key "local-hostname".
-#
-# Setting the instance ID:
-#     The instance ID may be set by way of the metadata key "instance-id".
-#     However, if this value is absent then then the instance ID is
-#     read from the file /sys/class/dmi/id/product_uuid.
-#
-# Configuring the network:
-#     The network is configured by setting the metadata key "network"
-#     with a value consistent with Network Config Versions 1 or 2,
-#     depending on the Linux distro's version of cloud-init:
-#
-#         Network Config Version 1 - http://bit.ly/cloudinit-net-conf-v1
-#         Network Config Version 2 - http://bit.ly/cloudinit-net-conf-v2
-#
-#     For example, CentOS 7's official cloud-init package is version
-#     0.7.9 and does not support Network Config Version 2. However,
-#     this datasource still supports supplying Network Config Version 2
-#     data as long as the Linux distro's cloud-init package is new
-#     enough to parse the data.
-#
-#     The metadata key "network.encoding" may be used to indicate the
-#     format of the metadata key "network". Valid encodings are base64
-#     and gzip+base64.
+
+class NetworkConfigError(Exception):
+    '''
+    Raised when network configuration cannot be obtained or applied, for
+    example when the metadata's network data has no 'config' key.
+    '''
+    pass
+
+
 class DataSourceVMwareGuestInfo(sources.DataSource):
+    '''
+    This cloud-init datasource was designed for use with CentOS 7,
+    which uses cloud-init 0.7.9. However, this datasource should
+    work with any Linux distribution for which cloud-init is
+    available.
+
+    The documentation for cloud-init 0.7.9's datasource is
+    available at http://bit.ly/cloudinit-datasource-0-7-9. The
+    current documentation for cloud-init is found at
+    https://cloudinit.readthedocs.io/en/latest/.
+
+    Setting the hostname:
+        The hostname is set by way of the metadata key "local-hostname".
+
+    Setting the instance ID:
+        The instance ID may be set by way of the metadata key "instance-id".
+        However, if this value is absent then the instance ID is
+        read from the file /sys/class/dmi/id/product_uuid.
+
+    Configuring the network:
+        The network is configured by setting the metadata key "network"
+        with a value consistent with Network Config Versions 1 or 2,
+        depending on the Linux distro's version of cloud-init:
+
+            Network Config Version 1 - http://bit.ly/cloudinit-net-conf-v1
+            Network Config Version 2 - http://bit.ly/cloudinit-net-conf-v2
+
+        For example, CentOS 7's official cloud-init package is version
+        0.7.9 and does not support Network Config Version 2. However,
+        this datasource still supports supplying Network Config Version 2
+        data as long as the Linux distro's cloud-init package is new
+        enough to parse the data.
+
+        The metadata key "network.encoding" may be used to indicate the
+        format of the metadata key "network". Valid encodings are base64
+        and gzip+base64.
+    '''
+
+    dsname = 'VMwareGuestInfo'
+
     def __init__(self, sys_cfg, distro, paths, ud_proc=None):
         sources.DataSource.__init__(self, sys_cfg, distro, paths, ud_proc)
-        self.vmtoolsd = find_executable("vmtoolsd")
-        if not self.vmtoolsd:
+        if not VMTOOLSD:
             LOG.error("Failed to find vmtoolsd")
 
     def get_data(self):
-        if not self.vmtoolsd:
+        """
+        This method should really be _get_data in accordance with the most
+        recent versions of cloud-init. However, because the datasource
+        supports as far back as cloud-init 0.7.9, get_data is still used.
+
+        Because of this the method attempts to do some of the same things
+        that the get_data functions in newer versions of cloud-init do,
+        such as calling persist_instance_data.
+        """
+        if not VMTOOLSD:
             LOG.error("vmtoolsd is required to fetch guestinfo value")
             return False
 
-        # Get the JSON metadata. Can be plain-text, base64, or gzip+base64.
-        metadata = self._get_encoded_guestinfo_data('metadata')
-        if metadata:
-            try:
-                self.metadata = json.loads(metadata)
-            except:
-                self.metadata = safeyaml.load(metadata)
+        # Get the metadata.
+        self.metadata = load_metadata()
 
-        # Get the YAML userdata. Can be plain-text, base64, or gzip+base64.
-        self.userdata_raw = self._get_encoded_guestinfo_data('userdata')
+        # Get the user data.
+        self.userdata_raw = guestinfo('userdata')
 
-        # Get the YAML vendordata. Can be plain-text, base64, or gzip+base64.
-        self.vendordata_raw = self._get_encoded_guestinfo_data('vendordata')
+        # Get the vendor data.
+        self.vendordata_raw = guestinfo('vendordata')
 
         return True
 
+    def setup(self, is_new_instance):
+        """setup(is_new_instance)
+
+        This is called before user-data and vendor-data have been processed.
+
+        Unless the datasource has set mode to 'local', then networking
+        per 'fallback' or per 'network_config' will have been written and
+        brought up by the OS at this point.
+        """
+
+        # Set the hostname when the metadata provides one.
+        hostname = self.metadata.get('local-hostname')
+        if hostname:
+            self.distro.set_hostname(hostname)
+            LOG.info("set hostname %s", hostname)
+
+        # Record the actual host name and network interfaces in the metadata.
+        # NOTE(review): indexing 'network' raises KeyError if it is not set.
+        host_info = get_host_info()
+        LOG.info("got host-info: %s", host_info)
+        hostname = host_info.get('local-hostname', hostname)
+        self.metadata['local-hostname'] = hostname
+        interfaces = host_info['network']['interfaces']
+        self.metadata['network']['interfaces'] = interfaces
+
+        # Persist the instance data here rather than in get_data so that the
+        # network interfaces are up and can be persisted with the metadata.
+        # Older cloud-init versions lack persist_instance_data, hence the
+        # AttributeError guard.
+        try:
+            self.persist_instance_data()
+        except AttributeError:
+            pass
+
     @property
     def network_config(self):
-        # Pull the network configuration out of the metadata.
-        if self.metadata and 'network' in self.metadata:
-            data = self._get_encoded_metadata('network')
-            if data:
-                # Load the YAML-formatted network data into an object
-                # and return it.
-                net_config = safeyaml.load(data)
-                LOG.debug("Loaded network config: %s", net_config)
-                return net_config
-        return None
+        if 'network' in self.metadata:
+            LOG.debug("using metadata network config")
+        else:
+            LOG.debug("using fallback network config")
+            self.metadata['network'] = {
+                'config': self.distro.generate_fallback_config(),
+            }
+        return self.metadata['network']['config']
 
     def get_instance_id(self):
         # Pull the instance ID out of the metadata if present. Otherwise
@@ -111,84 +164,211 @@
         if self.metadata and 'instance-id' in self.metadata:
             return self.metadata['instance-id']
         with open('/sys/class/dmi/id/product_uuid', 'r') as id_file:
-            return str(id_file.read()).rstrip()
+            self.metadata['instance-id'] = str(id_file.read()).rstrip()
+            return self.metadata['instance-id']
 
-    def _get_encoded_guestinfo_data(self, key):
-        data = self._get_guestinfo_value(key)
-        if not data:
-            return None
-        enc_type = self._get_guestinfo_value(key + '.encoding')
-        return self._get_encoded_data('guestinfo.' + key, enc_type, data)
 
-    def _get_encoded_metadata(self, key):
-        if not self.metadata or not key in self.metadata:
-            return None
-        data = self.metadata[key]
-        enc_type = self.metadata.get(key + '.encoding')
-        return self._get_encoded_data('metadata.' + key, enc_type, data)
+def decode(key, enc_type, data):
+    '''
+    decode returns the decoded string value of data
+    key is a string used to identify the data being decoded in log messages
+    ----
+    In py 2.7:
+    json.loads method takes string as input
+    zlib.decompress takes and returns a string
+    base64.b64decode takes and returns a string
+    -----
+    In py 3.6 and newer:
+    json.loads method takes bytes or string as input
+    zlib.decompress takes and returns a bytes
+    base64.b64decode takes bytes or string and returns bytes
+    -----
+    In py > 3, < 3.6:
+    json.loads method takes string as input
+    zlib.decompress takes and returns a bytes
+    base64.b64decode takes bytes or string and returns bytes
+    -----
+    Given the above conditions the output from zlib.decompress and
+    base64.b64decode would be bytes with newer python and str in older
+    version. Thus we would convert the output to str before returning
+    '''
+    LOG.debug("Getting encoded data for key=%s, enc=%s", key, enc_type)
 
-    def _get_encoded_data(self, key, enc_type, data):
-        '''
-        The _get_encoded_data would always return a str
-        ----
-        In py 2.7:
-        json.loads method takes string as input
-        zlib.decompress takes and returns a string
-        base64.b64decode takes and returns a string
-        -----
-        In py 3.6 and newer:
-        json.loads method takes bytes or string as input
-        zlib.decompress takes and returns a bytes
-        base64.b64decode takes bytes or string and returns bytes
-        -----
-        In py > 3, < 3.6:
-        json.loads method takes string as input
-        zlib.decompress takes and returns a bytes
-        base64.b64decode takes bytes or string and returns bytes
-        -----
-        Given the above conditions the output from zlib.decompress and
-        base64.b64decode would be bytes with newer python and str in older
-        version. Thus we would covert the output to str before returning
-        '''
-        rawdata = self._get_encoded_data_raw(key, enc_type, data)
-        if type(rawdata) == bytes:
-            return rawdata.decode('utf-8')
-        return rawdata
+    raw_data = None
+    if enc_type == "gzip+base64" or enc_type == "gz+b64":
+        LOG.debug("Decoding %s format %s", enc_type, key)
+        raw_data = zlib.decompress(base64.b64decode(data), zlib.MAX_WBITS | 16)
+    elif enc_type == "base64" or enc_type == "b64":
+        LOG.debug("Decoding %s format %s", enc_type, key)
+        raw_data = base64.b64decode(data)
+    else:
+        LOG.debug("Plain-text data %s", key)
+        raw_data = data
 
-    def _get_encoded_data_raw(self, key, enc_type, data):
-        LOG.debug("Getting encoded data for key=%s, enc=%s", key, enc_type)
-        if enc_type == "gzip+base64" or enc_type == "gz+b64":
-            LOG.debug("Decoding %s format %s", enc_type, key)
-            return zlib.decompress(base64.b64decode(data), zlib.MAX_WBITS | 16)
-        elif enc_type == "base64" or enc_type == "b64":
-            LOG.debug("Decoding %s format %s", enc_type, key)
-            return base64.b64decode(data)
+    if isinstance(raw_data, bytes):
+        return raw_data.decode('utf-8')
+    return raw_data
+
+
+def get_guestinfo_value(key):
+    '''
+    Returns a guestinfo value for the specified key.
+    '''
+    LOG.debug("Getting guestinfo value for key %s", key)
+    try:
+        (stdout, stderr) = util.subp(
+            [VMTOOLSD, "--cmd", "info-get guestinfo." + key])
+        if stderr == NOVAL:
+            LOG.debug("No value found for key %s", key)
+        elif not stdout:
+            LOG.error("Failed to get guestinfo value for key %s", key)
         else:
-            LOG.debug("Plain-text data %s", key)
-            return data
+            return stdout.rstrip()
+    except util.ProcessExecutionError as error:
+        if error.stderr == NOVAL:
+            LOG.debug("No value found for key %s", key)
+        else:
+            util.logexc(
+                LOG, "Failed to get guestinfo value for key %s: %s", key, error)
+    except Exception:
+        util.logexc(
+            LOG, "Unexpected error while trying to get guestinfo value for key %s", key)
+    return None
 
-    def _get_guestinfo_value(self, key):
-        NOVAL = "No value found"
-        LOG.debug("Getting guestinfo value for key %s", key)
-        try:
-            (stdout, stderr) = util.subp([self.vmtoolsd, "--cmd", "info-get guestinfo." + key])
-            if stderr == NOVAL:
-                LOG.debug("No value found for key %s", key)
-            elif not stdout:
-                LOG.error("Failed to get guestinfo value for key %s", key)
-            else:
-                return stdout.rstrip()
-        except util.ProcessExecutionError as error:
-            if error.stderr == NOVAL:
-                LOG.debug("No value found for key %s", key)
-            else:
-                util.logexc(LOG,"Failed to get guestinfo value for key %s: %s", key, error)
-        except Exception:
-            util.logexc(LOG,"Unexpected error while trying to get guestinfo value for key %s", key)
+
+def guestinfo(key):
+    '''
+    guestinfo returns the value of guestinfo.<key>, decoded according to
+    guestinfo.<key>.encoding, or None when no value could be read
+    '''
+    data = get_guestinfo_value(key)
+    if not data:
         return None
+    enc_type = get_guestinfo_value(key + '.encoding')
+    return decode('guestinfo.' + key, enc_type, data)
+
+
+def load(data):
+    '''
+    load first attempts to unmarshal the provided data as JSON, and if
+    that fails then attempts to unmarshal the data as YAML. If data is
+    None or empty then a new, empty dictionary is returned.
+    '''
+    if not data:
+        return {}
+    try:
+        return json.loads(data)
+    except Exception:  # not bare: never swallow SystemExit/KeyboardInterrupt
+        return safeyaml.load(data)
+
+
+def load_metadata():
+    '''
+    load_metadata loads the metadata from the guestinfo data, decoding
+    the network config when required; network must hold a "config" key
+    '''
+    data = load(guestinfo('metadata'))
+
+    network = None
+    if 'network' in data:
+        network = data['network']
+        del data['network']
+
+    network_enc = None
+    if 'network.encoding' in data:
+        network_enc = data['network.encoding']
+        del data['network.encoding']
+
+    if network:
+        # dict, not collections.Mapping: that alias is gone in Python 3.10+
+        if not isinstance(network, dict):
+            LOG.debug("decoding network data: %s", network)
+            network = load(decode('metadata.network', network_enc, network))
+        if 'config' not in network:
+            raise NetworkConfigError("missing 'config' key")
+        data['network'] = network
+
+    return data
+
 
 def get_datasource_list(depends):
-    """
+    '''
     Return a list of data sources that match this set of dependencies
-    """
+    '''
     return [DataSourceVMwareGuestInfo]
+
+
+def get_host_info():
+    '''
+    Returns the host's FQDN and its interfaces by-mac, by-ip4 and by-ip6.
+    '''
+    import netifaces
+    import socket
+
+    host_info = {
+        'network': {
+            'interfaces': {
+                'by-mac': collections.OrderedDict(),
+                'by-ip4': collections.OrderedDict(),
+                'by-ip6': collections.OrderedDict(),
+            },
+        },
+    }
+
+    hostname = socket.getfqdn()
+    if hostname:
+        host_info['local-hostname'] = hostname
+
+    by_mac = host_info['network']['interfaces']['by-mac']
+    by_ip4 = host_info['network']['interfaces']['by-ip4']
+    by_ip6 = host_info['network']['interfaces']['by-ip6']
+
+    ifaces = netifaces.interfaces()
+    for dev_name in ifaces:
+        addr_fams = netifaces.ifaddresses(dev_name)
+        af_link = addr_fams.get(netifaces.AF_LINK)
+        af_inet = addr_fams.get(netifaces.AF_INET)
+        af_inet6 = addr_fams.get(netifaces.AF_INET6)
+
+        mac = None
+        if af_link and 'addr' in af_link[0]:
+            mac = af_link[0]['addr']
+
+        # Do not bother recording localhost (identified by its all-zero MAC)
+        if mac == "00:00:00:00:00:00":
+            continue
+
+        if mac and (af_inet or af_inet6):  # by-mac needs a MAC and an address
+            key = mac
+            val = {}
+            if af_inet:
+                val["ip4"] = af_inet
+            if af_inet6:
+                val["ip6"] = af_inet6
+            by_mac[key] = val
+
+        if af_inet:  # index each IPv4 entry by address, minus its 'addr' key
+            for ip_info in af_inet:
+                key = ip_info['addr']
+                val = ip_info.copy()
+                del val['addr']
+                if mac:
+                    val['mac'] = mac
+                by_ip4[key] = val
+
+        if af_inet6:  # same indexing for the IPv6 entries
+            for ip_info in af_inet6:
+                key = ip_info['addr']
+                val = ip_info.copy()
+                del val['addr']
+                if mac:
+                    val['mac'] = mac
+                by_ip6[key] = val
+
+    return host_info
+
+
+if __name__ == "__main__":
+    print(util.json_dumps(get_host_info()))  # call form: valid on py2 and py3
+
+# vi: ts=4 expandtab