Merge pull request #47 from simonpasquier/fix-check-openstack-api-plugin

Fix check_openstack_api plugin
diff --git a/collectd/files/collectd.conf b/collectd/files/collectd.conf
index 0a64b0f..02918f1 100644
--- a/collectd/files/collectd.conf
+++ b/collectd/files/collectd.conf
@@ -40,6 +40,14 @@
 ReadThreads {{ client.read_threads }}
 {%- endif %}
 
+{%- if client.write_queue_limit_high is defined %}
+WriteQueueLimitHigh {{ client.write_queue_limit_high}}
+{%- endif %}
+
+{%- if client.write_queue_limit_low is defined %}
+WriteQueueLimitLow {{ client.write_queue_limit_low}}
+{%- endif %}
+
 ##############################################################################
 # Logging                                                                    #
 #----------------------------------------------------------------------------#
diff --git a/collectd/files/plugin/collectd_base.py b/collectd/files/plugin/collectd_base.py
index 4a9842a..4e6eaff 100644
--- a/collectd/files/plugin/collectd_base.py
+++ b/collectd/files/plugin/collectd_base.py
@@ -165,15 +165,12 @@
             non-zero status code (default=True).
 
         Returns:
-            A tuple containing the standard output and error strings if the
-            program execution has been successful.
+            A tuple containing the return code, the standard output and the
+            standard error if the program has been executed.
 
-            ("foobar\n", "")
+            (0, "foobar\n", "")
 
-            (None, "stderr of the command") if the command returned a
-            non-zero status code.
-
-            (None, None) if the command couldn't be executed at all.
+            (-1, None, None) if the program couldn't be executed at all.
         """
         start_time = time.time()
         try:
@@ -189,24 +186,19 @@
         except Exception as e:
             self.logger.error("Cannot execute command '%s': %s : %s" %
                               (cmd, str(e), traceback.format_exc()))
-            return (None, None)
+            return (-1, None, None)
 
         returncode = proc.returncode
 
-        if returncode != 0:
-            if log_error:
-                self.logger.error("Command '%s' failed (return code %d): %s" %
-                                  (cmd, returncode, stderr))
-            return (None, stderr)
+        if returncode != 0 and log_error:
+            self.logger.error("Command '%s' failed (return code %d): %s" %
+                              (cmd, returncode, stderr))
         if self.debug:
             elapsedtime = time.time() - start_time
             self.logger.info("Command '%s' returned %s in %0.3fs" %
                              (cmd, returncode, elapsedtime))
 
-        if not stdout and self.debug:
-            self.logger.info("Command '%s' returned no output!", cmd)
-
-        return (stdout, stderr)
+        return (returncode, stdout, stderr)
 
     def execute_to_json(self, *args, **kwargs):
         """Executes a program and decodes the output as a JSON string.
@@ -217,12 +209,12 @@
             A Python object or
             None if the execution of the program or JSON decoding fails.
         """
-        outputs = self.execute(*args, **kwargs)
-        if outputs:
+        (retcode, out, err) = self.execute(*args, **kwargs)
+        if retcode == 0:
             try:
-                return json.loads(outputs[0])
+                return json.loads(out)
             except ValueError as e:
-                self.logger.error("{}: document: '{}'".format(e, outputs[0]))
+                self.logger.error("{}: document: '{}'".format(e, out))
 
     @staticmethod
     def restore_sigchld():
diff --git a/collectd/files/plugin/collectd_glusterfs.py b/collectd/files/plugin/collectd_glusterfs.py
index 344d5b5..9b03e8f 100644
--- a/collectd/files/plugin/collectd_glusterfs.py
+++ b/collectd/files/plugin/collectd_glusterfs.py
@@ -59,9 +59,9 @@
 
     def itermetrics(self):
         # Collect peers' metrics
-        out, err = self.execute([GLUSTER_BINARY, 'peer', 'status'],
-                                shell=False)
-        if not out:
+        retcode, out, err = self.execute([GLUSTER_BINARY, 'peer', 'status'],
+                                         shell=False)
+        if retcode != 0:
             raise base.CheckException("Failed to execute 'gluster peer'")
 
         total = 0
@@ -107,8 +107,8 @@
 
         # Collect volumes' metrics
         cmd = [GLUSTER_BINARY, 'volume', 'status', 'all', 'detail']
-        out, err = self.execute(cmd, shell=False, log_error=False)
-        if not out:
+        retcode, out, err = self.execute(cmd, shell=False, log_error=False)
+        if retcode != 0:
             if err and vol_status_transaction_in_progress_re.match(err):
                 # "transaction already in progress" error, we assume volumes
                 # metrics are being collected on another glusterfs node, and
diff --git a/collectd/files/plugin/collectd_pacemaker.py b/collectd/files/plugin/collectd_pacemaker.py
index 682c100..87dc470 100644
--- a/collectd/files/plugin/collectd_pacemaker.py
+++ b/collectd/files/plugin/collectd_pacemaker.py
@@ -73,9 +73,9 @@
                 return 1
             return 0
 
-        out, err = self.execute([self.crm_mon_binary, '--as-xml', '-r', '-f'],
-                                shell=False)
-        if not out:
+        retcode, out, err = self.execute(
+            [self.crm_mon_binary, '--as-xml', '-r', '-f'], shell=False)
+        if retcode != 0:
             raise base.CheckException(
                 "Failed to execute crm_mon '{}'".format(err))
 
diff --git a/collectd/files/plugin/hypervisor_stats.py b/collectd/files/plugin/hypervisor_stats.py
index d03f7ae..ba6e050 100644
--- a/collectd/files/plugin/hypervisor_stats.py
+++ b/collectd/files/plugin/hypervisor_stats.py
@@ -42,21 +42,35 @@
         if 'cpu_ratio' not in self.extra_config:
             self.logger.warning('CpuAllocationRatio parameter not set')
 
-    def dispatch_value(self, name, value, host=None):
+    def dispatch_value(self, name, value, meta=None):
         v = collectd.Values(
             plugin=PLUGIN_NAME,
             type='gauge',
             type_instance=name,
             interval=INTERVAL,
             # w/a for https://github.com/collectd/collectd/issues/716
-            meta={'0': True},
+            meta=meta or {'0': True},
             values=[value]
         )
-        if host:
-            v.host = host
         v.dispatch()
 
     def collect(self):
+        nova_aggregates = {}
+        r = self.get('nova', 'os-aggregates')
+        if not r:
+            self.logger.warning("Could not get nova aggregates")
+        else:
+            aggregates_list = r.json().get('aggregates', [])
+            for agg in aggregates_list:
+                nova_aggregates[agg['name']] = {
+                    'id': agg['id'],
+                    'hosts': agg['hosts'],
+                    'metrics': {'free_vcpus': 0},
+                }
+                nova_aggregates[agg['name']]['metrics'].update(
+                    {v: 0 for v in self.VALUE_MAP.values()}
+                )
+
         r = self.get('nova', 'os-hypervisors/detail')
         if not r:
             self.logger.warning("Could not get hypervisor statistics")
@@ -69,14 +83,48 @@
             # remove domain name and keep only the hostname portion
             host = stats['hypervisor_hostname'].split('.')[0]
             for k, v in self.VALUE_MAP.iteritems():
-                self.dispatch_value(v, stats.get(k, 0), host)
-                total_stats[v] += stats.get(k, 0)
+                m_val = stats.get(k, 0)
+                self.dispatch_value(v, m_val, {'host': host})
+                total_stats[v] += m_val
+                for agg in nova_aggregates.keys():
+                    agg_hosts = nova_aggregates[agg]['hosts']
+                    if stats['hypervisor_hostname'] in agg_hosts:
+                        nova_aggregates[agg]['metrics'][v] += m_val
             if 'cpu_ratio' in self.extra_config:
+                m_vcpus = stats.get('vcpus', 0)
+                m_vcpus_used = stats.get('vcpus_used', 0)
                 free = (int(self.extra_config['cpu_ratio'] *
-                        stats.get('vcpus', 0))) - stats.get('vcpus_used', 0)
-                self.dispatch_value('free_vcpus', free, host)
+                        m_vcpus)) - m_vcpus_used
+                self.dispatch_value('free_vcpus', free, {'host': host})
                 total_stats['free_vcpus'] += free
+                for agg in nova_aggregates.keys():
+                    agg_hosts = nova_aggregates[agg]['hosts']
+                    if stats['hypervisor_hostname'] in agg_hosts:
+                        free = ((int(self.extra_config['cpu_ratio'] *
+                                     m_vcpus)) -
+                                m_vcpus_used)
+                        nova_aggregates[agg]['metrics']['free_vcpus'] += free
 
+        # Dispatch the aggregate metrics
+        for agg in nova_aggregates.keys():
+            agg_id = nova_aggregates[agg]['id']
+            agg_total_free_ram = (
+                nova_aggregates[agg]['metrics']['free_ram_MB'] +
+                nova_aggregates[agg]['metrics']['used_ram_MB']
+            )
+            # Only emit free_ram_percent when the aggregate's total RAM is
+            # > 0. Otherwise (for instance when the aggregate has no host)
+            # the metric is not emitted, so the corresponding alarms need a
+            # 'skip' no_data_policy to avoid being triggered.
+            if agg_total_free_ram > 0:
+                nova_aggregates[agg]['metrics']['free_ram_percent'] = round(
+                    (100.0 * nova_aggregates[agg]['metrics']['free_ram_MB']) /
+                    agg_total_free_ram,
+                    2)
+            for k, v in nova_aggregates[agg]['metrics'].iteritems():
+                self.dispatch_value('aggregate_{}'.format(k), v,
+                                    {'aggregate': agg,
+                                     'aggregate_id': agg_id})
         # Dispatch the global metrics
         for k, v in total_stats.iteritems():
             self.dispatch_value('total_{}'.format(k), v)
diff --git a/metadata/service/client/init.yml b/metadata/service/client/init.yml
index 6ef1257..0ed4a04 100644
--- a/metadata/service/client/init.yml
+++ b/metadata/service/client/init.yml
@@ -7,4 +7,6 @@
     client:
       enabled: true
       read_interval: 60
+      write_queue_limit_high: 10000
+      write_queue_limit_low: 10000
       use_fqdn: true
diff --git a/metadata/service/remote_client/cluster.yml b/metadata/service/remote_client/cluster.yml
index b1d8d34..238f8b0 100644
--- a/metadata/service/remote_client/cluster.yml
+++ b/metadata/service/remote_client/cluster.yml
@@ -8,5 +8,7 @@
       enabled: true
       read_interval: 10
       read_threads: 10
+      write_queue_limit_high: 10000
+      write_queue_limit_low: 10000
       use_fqdn: false
       automatic_starting: false