Merge pull request #1 from tcpcloud/collectd_update
Global collectd update
diff --git a/README.rst b/README.rst
index 96b4f97..b5ea9c6 100644
--- a/README.rst
+++ b/README.rst
@@ -7,6 +7,9 @@
Sample pillars
==============
+Data writers
+------------
+
Send data over TCP to Graphite Carbon
.. code-block:: yaml
@@ -16,7 +19,8 @@
         enabled: true
         read_interval: 60
         backend:
-          carbon:
+          carbon_service:
+            engine: carbon
             host: carbon1.domain.com
             port: 2003
@@ -29,12 +33,32 @@
         enabled: true
         read_interval: 60
         backend:
-          amqp:
+          amqp_broker:
+            engine: amqp
             host: broker1.domain.com
             port: 5672
             user: monitor
             password: amqp-pwd
-          virtual_host: '/monitor'
+            virtual_host: '/monitor'
+
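+Both the carbon and amqp writers also honor an optional ``prefix`` key
+(default ``default_prd.``) that is prepended to metric names; see
+``collectd/files/backend/carbon.conf`` and ``amqp.conf``. The prefix value
+below is illustrative:
+
+.. code-block:: yaml
+
+    collectd:
+      client:
+        backend:
+          carbon_service:
+            engine: carbon
+            host: carbon1.domain.com
+            port: 2003
+            prefix: cluster1_prd.
+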
+Send data over HTTP
+
+.. code-block:: yaml
+
+    collectd:
+      client:
+        enabled: true
+        read_interval: 60
+        backend:
+          http_service:
+            engine: http
+            host: service.domain.com
+            port: 8123
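+
+The http writer also understands optional ``format`` and ``store_rates``
+keys (defaults: ``json`` and ``true``), rendered by
+``collectd/files/backend/http.conf``:
+
+.. code-block:: yaml
+
+    collectd:
+      client:
+        backend:
+          http_service:
+            engine: http
+            host: service.domain.com
+            port: 8123
+            format: json
+            store_rates: true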
+
+
+Data collectors
+---------------
+
Monitor network devices defined in the 'external' dictionary
diff --git a/collectd/client.sls b/collectd/client.sls
index ae0013b..9b93af6 100644
--- a/collectd/client.sls
+++ b/collectd/client.sls
@@ -51,6 +51,10 @@
- makedirs: true
- user: root
+/usr/lib/collectd-python:
+ file.recurse:
+ - source: salt://collectd/files/plugin
+
{%- set service_grains = {'collectd': {'plugin': {}}} %}
{%- for service_name, service in pillar.items() %}
{%- if service.get('_support', {}).get('collectd', {}).get('enabled', False) %}
@@ -84,6 +88,8 @@
{%- for plugin_name, plugin in service_grains.collectd.plugin.iteritems() %}
+{%- if (plugin.get('execution', 'local') == 'local' or client.remote_collector) and plugin.get('plugin', 'native') not in ['python'] %}
+
{{ client.config_dir }}/{{ plugin_name }}.conf:
file.managed:
{%- if plugin.template is defined %}
@@ -92,7 +98,7 @@
- defaults:
plugin: {{ plugin|yaml }}
{%- else %}
- - contents: "LoadPlugin {{ plugin.plugin }}\n"
+ - contents: "<LoadPlugin {{ plugin.plugin }}>\n Globals false\n</LoadPlugin>\n"
{%- endif %}
- user: root
- mode: 660
@@ -103,8 +109,43 @@
- watch_in:
- service: collectd_service
+{%- endif %}
+
{%- endfor %}
+{%- if client.file_logging %}
+
+/etc/collectd/conf.d/00_collectd_logfile.conf:
+ file.managed:
+ - source: salt://collectd/files/collectd_logfile.conf
+ - user: root
+ - group: root
+ - mode: 660
+ - watch_in:
+ - service: collectd_service
+ - require:
+ - file: collectd_client_conf_dir
+ - require_in:
+ - file: collectd_client_conf_dir_clean
+
+{%- endif %}
+
+/etc/collectd/conf.d/collectd_python.conf:
+ file.managed:
+ - source: salt://collectd/files/collectd_python.conf
+ - template: jinja
+ - user: root
+ - group: root
+ - mode: 660
+ - defaults:
+ plugin: {{ service_grains.collectd.plugin|yaml }}
+ - watch_in:
+ - service: collectd_service
+ - require:
+ - file: collectd_client_conf_dir
+ - require_in:
+ - file: collectd_client_conf_dir_clean
+
/etc/collectd/filters.conf:
file.managed:
- source: salt://collectd/files/filters.conf
diff --git a/collectd/files/backend/amqp.conf b/collectd/files/backend/amqp.conf
index a6d6806..606f446 100644
--- a/collectd/files/backend/amqp.conf
+++ b/collectd/files/backend/amqp.conf
@@ -1,19 +1,19 @@
-{%- from "linux/map.jinja" import system with context %}
{%- set backend = salt['pillar.get']('collectd:client:backend:'+backend_name) %}
+<LoadPlugin amqp>
+ Globals false
+</LoadPlugin>
-LoadPlugin amqp
-
-<Plugin "amqp">
- <Publish "graphite">
- Host "{{ backend.host }}"
- Port "{{ backend.port }}"
- VHost "{{ backend.virtual_host }}"
- User "{{ backend.user }}"
- Password "{{ backend.password }}"
- Exchange "metrics"
- Persistent true
- Format "Graphite"
- GraphitePrefix "{{ system.get('cluster', 'prd') }}_{{ system.get('environment', 'prd') }}."
- StoreRates false
- </Publish>
+<Plugin amqp>
+ <Publish "graphite">
+ Host "{{ backend.host }}"
+ Port "{{ backend.port }}"
+ VHost "{{ backend.virtual_host }}"
+ User "{{ backend.user }}"
+ Password "{{ backend.password }}"
+ Exchange "metrics"
+ Persistent true
+ Format "Graphite"
+ GraphitePrefix "{{ backend.get('prefix', 'default_prd.') }}"
+ StoreRates false
+ </Publish>
</Plugin>
diff --git a/collectd/files/backend/carbon.conf b/collectd/files/backend/carbon.conf
index f0843d7..d39f4a4 100644
--- a/collectd/files/backend/carbon.conf
+++ b/collectd/files/backend/carbon.conf
@@ -1,18 +1,18 @@
-{%- from "linux/map.jinja" import system with context %}
{%- set backend = salt['pillar.get']('collectd:client:backend:'+backend_name) %}
-
-LoadPlugin write_graphite
+<LoadPlugin write_graphite>
+ Globals false
+</LoadPlugin>
<Plugin write_graphite>
- <Carbon>
- Host "{{ backend.host }}"
- Port "{{ backend.port }}"
- Prefix "{{ system.get('cluster', 'prd') }}_{{ system.get('environment', 'prd') }}."
- Postfix ""
- StoreRates false
- AlwaysAppendDS false
- EscapeCharacter "_"
- SeparateInstances true
- Protocol "{{ backend.get('protocol', 'tcp') }}"
- </Carbon>
+ <Carbon>
+ Host "{{ backend.host }}"
+ Port "{{ backend.port }}"
+ Prefix "{{ backend.get('prefix', 'default_prd.') }}"
+ Postfix ""
+ StoreRates false
+ AlwaysAppendDS false
+ EscapeCharacter "_"
+ SeparateInstances true
+ Protocol "{{ backend.get('protocol', 'tcp') }}"
+ </Carbon>
</Plugin>
diff --git a/collectd/files/backend/collectd.conf b/collectd/files/backend/collectd.conf
deleted file mode 100644
index aed68c3..0000000
--- a/collectd/files/backend/collectd.conf
+++ /dev/null
@@ -1,34 +0,0 @@
-LoadPlugin network
-{%- for plugin in pillar.collectd.client.plugins %}
-{%- if plugin_name == plugin.name %}
-<Plugin "network">
- {% if plugin.mode == 'client' %}
- Server "{{ plugin.host }}" "{{ plugin.port }}"
-# <Server "{{ plugin.host }}" "{{ plugin.port }}">
-# SecurityLevel Encrypt
-# Username "{{ plugin.user }}"
-# Password "{{ plugin.password }}"
-# </Server>
- TimeToLive 128
- {% endif %}
- {% if plugin.mode == 'server' %}
- Listen "{{ plugin.host }}" "{{ plugin.port }}"
-# <Listen "{{ plugin.host }}" "{{ plugin.port }}">
-# SecurityLevel Sign
-# AuthFile "/etc/collectd/passwd"
-# Interface "{{ plugin.interface }}"
-# </Listen>
- MaxPacketSize 1024
- {% endif %}
-
-# # proxy setup (client and server as above):
-# Forward true
-#
-# # statistics about the network plugin itself
-# ReportStats false
-#
-# # "garbage collection"
-# CacheFlush 1800
-</Plugin>
-{%- endif %}
-{%- endfor %}
diff --git a/collectd/files/backend/http.conf b/collectd/files/backend/http.conf
new file mode 100644
index 0000000..66401be
--- /dev/null
+++ b/collectd/files/backend/http.conf
@@ -0,0 +1,11 @@
+{%- set backend = salt['pillar.get']('collectd:client:backend:'+backend_name) %}
+<LoadPlugin write_http>
+ Globals false
+</LoadPlugin>
+
+<Plugin write_http>
+ <URL "http://{{ backend.host }}:{{ backend.port }}">
+ Format "{{ backend.get('format', 'json')|upper }}"
+ StoreRates {{ backend.get('store_rates', True)|lower }}
+ </URL>
+</Plugin>
diff --git a/collectd/files/backend/network.conf b/collectd/files/backend/network.conf
new file mode 100644
index 0000000..08fcd63
--- /dev/null
+++ b/collectd/files/backend/network.conf
@@ -0,0 +1,36 @@
+{%- from "collectd/map.jinja" import client with context %}
+<LoadPlugin network>
+ Globals false
+</LoadPlugin>
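+
+{#- Pillar sketch (illustrative, not part of this change): each network
+    backend entry provides at least mode (client or server), host and port:
+      backend:
+        net_collector:
+          engine: network
+          mode: client
+          host: 192.168.0.1
+          port: 25826 -#}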
+
+{%- for backend_name, backend in client.backend.iteritems() %}
+<Plugin network>
+ {%- if backend.mode == 'client' %}
+ Server "{{ backend.host }}" "{{ backend.port }}"
+# <Server "{{ backend.host }}" "{{ backend.port }}">
+# SecurityLevel Encrypt
+# Username "{{ backend.user }}"
+# Password "{{ backend.password }}"
+# </Server>
+ TimeToLive 128
+ {%- endif %}
+ {%- if backend.mode == 'server' %}
+ Listen "{{ backend.host }}" "{{ backend.port }}"
+# <Listen "{{ backend.host }}" "{{ backend.port }}">
+# SecurityLevel Sign
+# AuthFile "/etc/collectd/passwd"
+# Interface "{{ backend.interface }}"
+# </Listen>
+ MaxPacketSize 1024
+ {%- endif %}
+
+# # proxy setup (client and server as above):
+# Forward true
+#
+# # statistics about the network backend itself
+# ReportStats false
+#
+# # "garbage collection"
+# CacheFlush 1800
+</Plugin>
+{%- endfor %}
diff --git a/collectd/files/collectd.conf b/collectd/files/collectd.conf
index c55c8a1..d113f04 100644
--- a/collectd/files/collectd.conf
+++ b/collectd/files/collectd.conf
@@ -878,9 +878,15 @@
# <Node>
#</Plugin>
+{%- if client.file_logging %}
+Include "/etc/collectd/conf.d/00_collectd_logfile.conf"
+{%- endif %}
{%- for plugin_name, plugin in service_grains.collectd.plugin.iteritems() %}
+{%- if (plugin.get('execution', 'local') == 'local' or client.remote_collector) and plugin.get('plugin', 'native') not in ['python'] %}
Include "{{ client.config_dir }}/{{ plugin_name }}.conf"
+{%- endif %}
{%- endfor %}
+Include "{{ client.config_dir }}/collectd_python.conf"
{%- for backend_name, backend in client.backend.iteritems() %}
Include "{{ client.config_dir }}/collectd_writer_{{ backend_name }}.conf"
{%- endfor %}
diff --git a/collectd/files/collectd_check_local_endpoint.conf b/collectd/files/collectd_check_local_endpoint.conf
new file mode 100644
index 0000000..4144e96
--- /dev/null
+++ b/collectd/files/collectd_check_local_endpoint.conf
@@ -0,0 +1,17 @@
+
+{%- if plugin.get('endpoint', {})|length > 0 %}
+
+Import "check_local_endpoint"
+
+<Module "check_local_endpoint">
+ MaxRetries "3"
+ Timeout "1"
+ {%- for endpoint_name, endpoint in plugin.endpoint.iteritems() %}
+ ExpectedCode "{{ endpoint_name }}" "{{ endpoint.expected_code }}"
+ {%- endfor %}
+ {%- for endpoint_name, endpoint in plugin.endpoint.iteritems() %}
+ Url "{{ endpoint_name }}" "{{ endpoint.url }}"
+ {%- endfor %}
+</Module>
+
+{%- endif %}
\ No newline at end of file
diff --git a/collectd/files/collectd_curl.conf b/collectd/files/collectd_curl.conf
index a0f97f9..f80d684 100644
--- a/collectd/files/collectd_curl.conf
+++ b/collectd/files/collectd_curl.conf
@@ -1,4 +1,6 @@
-LoadPlugin curl
+<LoadPlugin curl>
+ Globals false
+</LoadPlugin>
<Plugin curl>
{%- for data_name, data in plugin.data.iteritems() %}
diff --git a/collectd/files/collectd_logfile.conf b/collectd/files/collectd_logfile.conf
new file mode 100644
index 0000000..55bfbff
--- /dev/null
+++ b/collectd/files/collectd_logfile.conf
@@ -0,0 +1,10 @@
+<LoadPlugin logfile>
+ Globals false
+</LoadPlugin>
+
+<Plugin logfile>
+ LogLevel warning
+ File "/var/log/collectd.log"
+ Timestamp true
+ PrintSeverity false
+</Plugin>
diff --git a/collectd/files/collectd_nginx.conf b/collectd/files/collectd_nginx.conf
deleted file mode 100644
index c581b19..0000000
--- a/collectd/files/collectd_nginx.conf
+++ /dev/null
@@ -1,11 +0,0 @@
-
-{%- if salt['pillar.get']('nginx:server:site:nginx_stats_server') %}
-{%- set stats_server = salt['pillar.get']('nginx:server:site:nginx_stats_server') %}
-
-LoadPlugin nginx
-<Plugin "nginx">
- URL "http://localhost:{{ stats_server.host.port }}"
-</Plugin>
-
-{%- endif %}
-
diff --git a/collectd/files/collectd_ping.conf b/collectd/files/collectd_ping.conf
index 5c2687c..b06fada 100644
--- a/collectd/files/collectd_ping.conf
+++ b/collectd/files/collectd_ping.conf
@@ -1,7 +1,9 @@
-LoadPlugin ping
+<LoadPlugin ping>
+ Globals false
+</LoadPlugin>
{%- for data_name, data in plugin.data.iteritems() %}
-<Plugin "ping">
+<Plugin ping>
Host "{{ data.host }}"
</Plugin>
{%- endfor %}
diff --git a/collectd/files/collectd_processes.conf b/collectd/files/collectd_processes.conf
index d369a3a..f93455a 100644
--- a/collectd/files/collectd_processes.conf
+++ b/collectd/files/collectd_processes.conf
@@ -1,6 +1,13 @@
-LoadPlugin processes
+<LoadPlugin processes>
+ Globals false
+</LoadPlugin>
+
<Plugin processes>
- {%- for process_name, process in plugin.process.iteritems() %}
- ProcessMatch "{{ process_name }}" "{{ process.match }}"
- {%- endfor %}
+ {%- for process_name, process in plugin.process.iteritems() %}
+ {%- if process.match is defined %}
+ ProcessMatch "{{ process_name }}" "{{ process.match }}"
+ {%- else %}
+ Process "{{ process_name }}"
+ {%- endif %}
+ {%- endfor %}
</Plugin>
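+
+# Sketch of the expected 'process' map (names below are illustrative):
+# an entry with a 'match' key renders ProcessMatch, a plain entry renders
+# a Process line:
+#   process:
+#     sshd: {}
+#     nova_api:
+#       match: 'nova-api'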
diff --git a/collectd/files/collectd_python.conf b/collectd/files/collectd_python.conf
new file mode 100644
index 0000000..6142345
--- /dev/null
+++ b/collectd/files/collectd_python.conf
@@ -0,0 +1,16 @@
+{%- from "collectd/map.jinja" import client with context %}
+<LoadPlugin python>
+ Globals false
+</LoadPlugin>
+
+<Plugin python>
+ ModulePath "/usr/lib/collectd-python"
+ LogTraces false
+ Interactive false
+
+ {%- for plugin_name, plugin in plugin.iteritems() %}
+ {%- if (plugin.get('execution', 'local') == 'local' or client.remote_collector) and plugin.get('plugin', 'native') == 'python' %}
+ {%- include plugin.template %}
+ {%- endif %}
+ {%- endfor %}
+
+</Plugin>
diff --git a/collectd/files/collectd_snmp.conf b/collectd/files/collectd_snmp.conf
index 6856317..5ef5227 100644
--- a/collectd/files/collectd_snmp.conf
+++ b/collectd/files/collectd_snmp.conf
@@ -1,4 +1,7 @@
-LoadPlugin snmp
+<LoadPlugin snmp>
+ Globals false
+</LoadPlugin>
+
<Plugin snmp>
{%- for data_name, data in plugin.data.iteritems() %}
<Data "{{ data_name }}">
diff --git a/collectd/files/plugin/build_ceph_perf_types.py b/collectd/files/plugin/build_ceph_perf_types.py
new file mode 100755
index 0000000..b7c0eb0
--- /dev/null
+++ b/collectd/files/plugin/build_ceph_perf_types.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+import subprocess
+import sys
+
+
+class CephPerfCollectionSchema(object):
+
+ def __init__(self, collection, schema):
+ self.collection = collection
+ self.schema = schema
+
+ def __str__(self):
+ def sanitize(s):
+ return s.replace('::', '_').replace('-', '_').lower()
+
+ return '\n'.join(['%s_%s value:GAUGE:U:U' % (sanitize(self.collection),
+ sanitize(k))
+ for k in sorted(self.schema.iterkeys())])
+
+
+class CephPerfSchema(object):
+
+ def __init__(self, socket_path):
+ self.socket_path = socket_path
+
+ @staticmethod
+ def run_command(cmd):
+ try:
+ proc = subprocess.Popen(
+ cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE
+ )
+ (stdout, stderr) = proc.communicate()
+ stdout = stdout.rstrip('\n')
+ except Exception as e:
+ print("Cannot execute command '%s': %s" % (cmd, str(e)))
+ raise e
+
+ return json.loads(stdout)
+
+ def ceph_version(self):
+ cmd = ['/usr/bin/ceph', '--admin-daemon', self.socket_path, 'version']
+ return self.run_command(cmd).get('version')
+
+ def itertypes(self):
+ cmd = ['/usr/bin/ceph', '--admin-daemon', self.socket_path, 'perf',
+ 'schema']
+
+ for collection, schema in self.run_command(cmd).iteritems():
+ yield CephPerfCollectionSchema(collection, schema)
+
+
+def main():
+ script_name = os.path.basename(sys.argv[0])
+ if len(sys.argv) < 2 or len(sys.argv) > 3:
+ print("usage: %s <Ceph OSD socket> [namespace]" % script_name)
+ else:
+ schema = CephPerfSchema(sys.argv[1])
+ collection = sys.argv[2] if len(sys.argv) == 3 else None
+ print("# File generated automatically by the %s script" % script_name)
+ print("# Ceph version: %s" % schema.ceph_version())
+ for item in schema.itertypes():
+ if collection is None or item.collection == collection:
+ print(item)
+
+if __name__ == '__main__':
+ main()
diff --git a/collectd/files/plugin/ceph_osd_perf.py b/collectd/files/plugin/ceph_osd_perf.py
new file mode 100644
index 0000000..3919611
--- /dev/null
+++ b/collectd/files/plugin/ceph_osd_perf.py
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+import glob
+import re
+
+import collectd_base as base
+
+INTERVAL = 60
+RE_OSD_ID = re.compile(r".*?osd\.(\d+)\.asok$")
+
+
+class CephOSDPerfPlugin(base.CephBase):
+ """Collect OSD performance counters of OSD daemons running on the host."""
+
+ # Collect only metrics from the 'osd' namespace
+    PREFIXES = ('osd',)  # trailing comma: a tuple, not a string
+
+ def __init__(self, *args, **kwargs):
+ super(CephOSDPerfPlugin, self).__init__(*args, **kwargs)
+ self.plugin = 'ceph_osd_perf'
+ self.socket_glob = None
+
+ def config_callback(self, conf):
+ super(CephOSDPerfPlugin, self).config_callback(conf)
+ for node in conf.children:
+ if node.key == "AdminSocket":
+ self.socket_glob = node.values[0]
+
+ if not self.socket_glob:
+ raise Exception("AdminSocket not defined")
+
+ @staticmethod
+ def convert_to_collectd_value(value):
+        # See this thread for details:
+        # https://www.mail-archive.com/ceph-users@lists.ceph.com/msg18705.html
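+        # e.g. {'avgcount': 4, 'sum': 2.0} -> 0.5 (the average value)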
+ if isinstance(value, dict):
+ if value['avgcount'] > 0:
+ return value['sum'] / value['avgcount']
+ else:
+ return 0.0
+ else:
+ return value
+
+ @staticmethod
+ def convert_to_collectd_type(*args):
+ return '_'.join([s.replace('::', '_').replace('-', '_').lower() for s
+ in args])
+
+ def itermetrics(self):
+ check_errors = []
+ checked = 0
+ for socket_name in glob.glob(self.socket_glob):
+ m = RE_OSD_ID.match(socket_name)
+ if not m:
+ continue
+
+ osd_id = m.group(1)
+ perf_dump = self.execute_to_json('ceph --admin-daemon %s perf dump'
+ % socket_name)
+ if not perf_dump:
+ check_errors.append(osd_id)
+ continue
+
+ checked += 1
+ for prefix, stats in perf_dump.iteritems():
+ if prefix not in self.PREFIXES or not stats:
+ continue
+
+ for k in sorted(stats.iterkeys()):
+ yield {
+ 'type': self.convert_to_collectd_type(prefix, k),
+ 'type_instance': osd_id,
+ 'values': self.convert_to_collectd_value(stats[k])
+ }
+
+ if check_errors:
+ raise base.CheckException(
+ "Fail to run 'ceph perf dump' for OSD(s): {}".format(
+ ', '.join(check_errors)))
+
+ if checked == 0:
+ raise base.CheckException(
+ 'Could not find any OSD socket in {}'.format(self.socket_glob)
+ )
+
+
+plugin = CephOSDPerfPlugin(collectd, 'ceph_osd')
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/ceph_osd_stats.py b/collectd/files/plugin/ceph_osd_stats.py
new file mode 100644
index 0000000..cf5711c
--- /dev/null
+++ b/collectd/files/plugin/ceph_osd_stats.py
@@ -0,0 +1,67 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+
+import collectd_base as base
+
+INTERVAL = 60
+
+
+class CephOSDStatsPlugin(base.CephBase):
+ """ Collect per OSD stats about store size and commit latency."""
+
+ def __init__(self, *args, **kwargs):
+ super(CephOSDStatsPlugin, self).__init__(*args, **kwargs)
+ self.plugin = 'ceph_osd'
+
+ def itermetrics(self):
+ osd_stats = self.execute_to_json('ceph pg dump osds --format json')
+ if not osd_stats:
+ raise base.CheckException("Fail to execute 'pg dump osds'")
+
+ for osd in osd_stats:
+ osd_id = osd['osd']
+
+ yield {
+ 'type_instance': osd_id,
+ 'type': 'osd_space',
+ 'values': [osd['kb_used'] * 1000, osd['kb'] * 1000],
+ }
+
+ yield {
+ 'type_instance': osd_id,
+ 'type': 'osd_latency',
+ 'values': [osd['fs_perf_stat']['apply_latency_ms'],
+ osd['fs_perf_stat']['commit_latency_ms']],
+ }
+
+plugin = CephOSDStatsPlugin(collectd, 'ceph_mon')
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/ceph_pg_mon_status.py b/collectd/files/plugin/ceph_pg_mon_status.py
new file mode 100644
index 0000000..c2bf47e
--- /dev/null
+++ b/collectd/files/plugin/ceph_pg_mon_status.py
@@ -0,0 +1,97 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+
+import collectd_base as base
+
+INTERVAL = 30
+HEALTH_MAP = {
+ 'HEALTH_OK': 1,
+ 'HEALTH_WARN': 2,
+ 'HEALTH_ERR': 3,
+}
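+# collectd can only dispatch numeric values, hence the mapping above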
+
+
+class CephMonPlugin(base.CephBase):
+ """ Collect states and metrics about ceph cluster and placement groups."""
+
+ def __init__(self, *args, **kwargs):
+ super(CephMonPlugin, self).__init__(*args, **kwargs)
+ self.plugin = 'ceph_mon'
+
+ def itermetrics(self):
+ status = self.execute_to_json('ceph -s --format json')
+ if not status:
+ raise base.CheckException("Fail to execute 'ceph -s'")
+
+ yield {
+ 'type': 'health',
+ 'values': HEALTH_MAP[status['health']['overall_status']],
+ }
+
+ if 'mons' in status['monmap']:
+ monitor_nb = len(status['monmap']['mons'])
+ else:
+ monitor_nb = 0
+ yield {
+ 'type': 'monitor_count',
+ 'values': monitor_nb
+ }
+
+ yield {
+ 'type': 'quorum_count',
+ 'values': len(status.get('quorum', []))
+ }
+
+ pgmap = status['pgmap']
+ yield {
+ 'type': 'pg_bytes',
+ 'values': [pgmap['bytes_used'], pgmap['bytes_avail'],
+ pgmap['bytes_total']],
+ }
+ yield {
+ 'type': 'pg_data_bytes',
+ 'values': pgmap['data_bytes']
+ }
+ yield {
+ 'type': 'pg_count',
+ 'values': pgmap['num_pgs']
+ }
+
+ for state in pgmap['pgs_by_state']:
+ yield {
+ 'type': 'pg_state_count',
+ 'type_instance': state['state_name'],
+ 'values': state['count']
+ }
+
+plugin = CephMonPlugin(collectd, 'ceph_mon')
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/ceph_pool_osd.py b/collectd/files/plugin/ceph_pool_osd.py
new file mode 100644
index 0000000..e5a5044
--- /dev/null
+++ b/collectd/files/plugin/ceph_pool_osd.py
@@ -0,0 +1,136 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+
+import collectd_base as base
+
+INTERVAL = 60
+
+
+class CephPoolPlugin(base.CephBase):
+ """ Collect Ceph pool metrics and OSD daemons state"""
+
+ def __init__(self, *args, **kwargs):
+ super(CephPoolPlugin, self).__init__(*args, **kwargs)
+ self.plugin = 'ceph_pool'
+
+ def itermetrics(self):
+ df = self.execute_to_json('ceph df --format json')
+ if not df:
+ raise base.CheckException("Fail to run 'ceph df'")
+
+ objects_count = 0
+ for pool in df['pools']:
+ objects_count += pool['stats'].get('objects', 0)
+ for m in ('bytes_used', 'max_avail', 'objects'):
+ yield {
+ 'type': 'pool_%s' % m,
+ 'type_instance': pool['name'],
+ 'values': pool['stats'].get(m, 0),
+ }
+
+ yield {
+ 'type': 'objects_count',
+ 'values': objects_count
+ }
+ yield {
+ 'type': 'pool_count',
+ 'values': len(df['pools'])
+ }
+
+ if 'total_bytes' in df['stats']:
+ # compatibility with 0.84+
+ total = df['stats']['total_bytes']
+ used = df['stats']['total_used_bytes']
+ avail = df['stats']['total_avail_bytes']
+ else:
+ # compatibility with <0.84
+ total = df['stats']['total_space'] * 1024
+ used = df['stats']['total_used'] * 1024
+ avail = df['stats']['total_avail'] * 1024
+
+ yield {
+ 'type': 'pool_total_bytes',
+ 'values': [used, avail, total]
+ }
+ yield {
+ 'type': 'pool_total_percent',
+ 'values': [100.0 * used / total, 100.0 * avail / total]
+ }
+
+ stats = self.execute_to_json('ceph osd pool stats --format json')
+ if not stats:
+ raise base.CheckException("Fail to run 'ceph osd pool stats'")
+
+ for pool in stats:
+ client_io_rate = pool.get('client_io_rate', {})
+ yield {
+ 'type': 'pool_bytes_rate',
+ 'type_instance': pool['pool_name'],
+ 'values': [client_io_rate.get('read_bytes_sec', 0),
+ client_io_rate.get('write_bytes_sec', 0)]
+ }
+ yield {
+ 'type': 'pool_ops_rate',
+ 'type_instance': pool['pool_name'],
+ 'values': client_io_rate.get('op_per_sec', 0)
+ }
+
+ osd = self.execute_to_json('ceph osd dump --format json')
+ if not osd:
+ raise base.CheckException("Fail to run 'ceph osd dump'")
+
+ for pool in osd['pools']:
+ for name in ('size', 'pg_num', 'pg_placement_num'):
+ yield {
+ 'type': 'pool_%s' % name,
+ 'type_instance': pool['pool_name'],
+ 'values': pool[name]
+ }
+
+ _up, _down, _in, _out = (0, 0, 0, 0)
+        for osd_state in osd['osds']:
+            if osd_state['up'] == 1:
+                _up += 1
+            else:
+                _down += 1
+            if osd_state['in'] == 1:
+                _in += 1
+            else:
+                _out += 1
+
+ yield {
+ 'type': 'osd_count',
+ 'values': [_up, _down, _in, _out]
+ }
+
+plugin = CephPoolPlugin(collectd, 'ceph_mon')
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/check_local_endpoint.py b/collectd/files/plugin/check_local_endpoint.py
new file mode 100644
index 0000000..e01715f
--- /dev/null
+++ b/collectd/files/plugin/check_local_endpoint.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+import collectd_base as base
+import http_check
+
+
+NAME = 'check_local_endpoint'
+
+
+class CheckLocalEndpoint(http_check.HTTPCheckPlugin):
+
+ def __init__(self, *args, **kwargs):
+ super(CheckLocalEndpoint, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+
+plugin = CheckLocalEndpoint(collectd)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, base.INTERVAL)
diff --git a/collectd/files/plugin/check_openstack_api.py b/collectd/files/plugin/check_openstack_api.py
new file mode 100644
index 0000000..9d2ef59
--- /dev/null
+++ b/collectd/files/plugin/check_openstack_api.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Collectd plugin for checking the status of OpenStack API services
+import collectd
+
+import collectd_openstack as openstack
+
+from urlparse import urlparse
+
+PLUGIN_NAME = 'check_openstack_api'
+INTERVAL = openstack.INTERVAL
+
+
+class APICheckPlugin(openstack.CollectdPlugin):
+ """Class to check the status of OpenStack API services."""
+
+ # TODO(all): sahara, murano
+ CHECK_MAP = {
+ 'keystone': {
+ 'path': '/', 'expect': [300], 'name': 'keystone-public-api'},
+ 'heat': {'path': '/', 'expect': [300], 'name': 'heat-api'},
+ 'heat-cfn': {'path': '/', 'expect': [300], 'name': 'heat-cfn-api'},
+ 'glance': {'path': '/', 'expect': [300], 'name': 'glance-api'},
+        # Since Mitaka, Cinder returns 300; previous releases returned 200
+ 'cinder': {'path': '/', 'expect': [200, 300], 'name': 'cinder-api'},
+ 'cinderv2': {
+ 'path': '/', 'expect': [200, 300], 'name': 'cinder-v2-api'},
+ 'neutron': {'path': '/', 'expect': [200], 'name': 'neutron-api'},
+ 'nova': {'path': '/', 'expect': [200], 'name': 'nova-api'},
+ # Ceilometer requires authentication for all paths
+ 'ceilometer': {
+ 'path': 'v2/capabilities', 'expect': [200], 'auth': True,
+ 'name': 'ceilometer-api'},
+ 'swift': {'path': 'healthcheck', 'expect': [200], 'name': 'swift-api'},
+ 'swift_s3': {
+ 'path': 'healthcheck', 'expect': [200], 'name': 'swift-s3-api'},
+ }
+
+ def _service_url(self, endpoint, path):
+ url = urlparse(endpoint)
+ u = '%s://%s' % (url.scheme, url.netloc)
+ if path != '/':
+ u = '%s/%s' % (u, path)
+ return u
+
+ def check_api(self):
+ """ Check the status of all the API services.
+
+        Yields dict items with 'service', 'status' (either OK, FAIL or
+        UNKNOWN) and 'region' keys.
+ """
+ catalog = self.service_catalog
+ for service in catalog:
+ name = service['name']
+ if name not in self.CHECK_MAP:
+ self.logger.notice(
+ "No check found for service '%s', skipping it" % name)
+ status = self.UNKNOWN
+ else:
+ check = self.CHECK_MAP[name]
+ url = self._service_url(service['url'], check['path'])
+ r = self.raw_get(url, token_required=check.get('auth', False))
+
+ if r is None or r.status_code not in check['expect']:
+                    def _status(ret):
+                        return 'N/A' if ret is None else ret.status_code
+
+ self.logger.notice(
+ "Service %s check failed "
+ "(returned '%s' but expected '%s')" % (
+ name, _status(r), check['expect'])
+ )
+ status = self.FAIL
+ else:
+ status = self.OK
+
+ yield {
+ 'service': check.get('name', name),
+ 'status': status,
+ 'region': service['region']
+ }
+
+ def collect(self):
+ for item in self.check_api():
+ if item['status'] == self.UNKNOWN:
+ # skip if status is UNKNOWN
+ continue
+
+ value = collectd.Values(
+ plugin=PLUGIN_NAME,
+ plugin_instance=item['service'],
+ type='gauge',
+ interval=INTERVAL,
+ values=[item['status']],
+ meta={'region': item['region']}
+ )
+ value.dispatch()
+
+
+plugin = APICheckPlugin(collectd, PLUGIN_NAME)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/collectd_apache_check.py b/collectd/files/plugin/collectd_apache_check.py
new file mode 100644
index 0000000..63ef855
--- /dev/null
+++ b/collectd/files/plugin/collectd_apache_check.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+import collectd_base as base
+import requests
+
+NAME = 'apache'
+
+
+class ApacheCheckPlugin(base.Base):
+
+ def __init__(self, *args, **kwargs):
+ super(ApacheCheckPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.url = None
+
+ def config_callback(self, conf):
+ super(ApacheCheckPlugin, self).config_callback(conf)
+
+ for node in conf.children:
+ if node.key == 'Url':
+ self.url = node.values[0]
+
+ if self.url is None:
+ self.logger.error("{}: Missing Url parameter".format(NAME))
+
+ def read_callback(self):
+ try:
+ requests.get(self.url, timeout=5)
+ self.dispatch_check_metric(self.OK)
+ except Exception as err:
+ msg = "{}: Failed to check service: {}".format(NAME, err)
+ self.logger.error(msg)
+ self.dispatch_check_metric(self.FAIL, msg)
+
+
+plugin = ApacheCheckPlugin(collectd)
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
diff --git a/collectd/files/plugin/collectd_base.py b/collectd/files/plugin/collectd_base.py
new file mode 100644
index 0000000..f042628
--- /dev/null
+++ b/collectd/files/plugin/collectd_base.py
@@ -0,0 +1,276 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import wraps
+import json
+import signal
+import subprocess
+import sys
+import time
+import traceback
+
+
+INTERVAL = 10
+
+
+class CheckException(Exception):
+ pass
+
+
+# A decorator that will call the decorated function only when the plugin has
+# detected that it is currently active.
+def read_callback_wrapper(f):
+ @wraps(f)
+ def wrapper(self, *args, **kwargs):
+ if self.do_collect_data:
+ f(self, *args, **kwargs)
+
+ return wrapper
+
+
+class Base(object):
+ """Base class for writing Python plugins."""
+
+ FAIL = 0
+ OK = 1
+ UNKNOWN = 2
+
+ MAX_IDENTIFIER_LENGTH = 63
+
+ def __init__(self, collectd, service_name=None):
+ self.debug = False
+ self.timeout = 5
+ self.max_retries = 3
+ self.logger = collectd
+ self.collectd = collectd
+ self.plugin = None
+ self.plugin_instance = ''
+ # attributes controlling whether the plugin is in collect mode or not
+ self.depends_on_resource = None
+ self.do_collect_data = True
+
+ self.service_name = service_name
+
+ def config_callback(self, conf):
+ for node in conf.children:
+ if node.key == "Debug":
+ if node.values[0] in ['True', 'true']:
+ self.debug = True
+ elif node.key == "Timeout":
+ self.timeout = int(node.values[0])
+ elif node.key == 'MaxRetries':
+ self.max_retries = int(node.values[0])
+ elif node.key == 'DependsOnResource':
+ self.depends_on_resource = node.values[0]
+
+ @read_callback_wrapper
+ def conditional_read_callback(self):
+ self.read_callback()
+
+ def read_callback(self):
+ try:
+ for metric in self.itermetrics():
+ self.dispatch_metric(metric)
+ except CheckException as e:
+ msg = '{}: {}'.format(self.plugin, e)
+ self.logger.warning(msg)
+ self.dispatch_check_metric(self.FAIL, msg)
+ except Exception as e:
+ msg = '{}: Failed to get metrics: {}'.format(self.plugin, e)
+ self.logger.error('{}: {}'.format(msg, traceback.format_exc()))
+ self.dispatch_check_metric(self.FAIL, msg)
+ else:
+ self.dispatch_check_metric(self.OK)
+
+ def dispatch_check_metric(self, check, failure=None):
+ metric = {
+ 'meta': {'service_check': self.service_name or self.plugin},
+ 'values': check,
+ }
+
+ if failure is not None:
+ metric['meta']['failure'] = failure
+
+ self.dispatch_metric(metric)
+
+ def itermetrics(self):
+ """Iterate over the collected metrics
+
+        This method must be implemented by the subclass and should yield
+        dict objects that represent the collected values. Each dict can
+        contain up to 6 keys:
+ - 'values', a scalar number or a list of numbers if the type
+ defines several datasources.
+ - 'type_instance' (optional)
+ - 'plugin_instance' (optional)
+ - 'type' (optional, default='gauge')
+ - 'meta' (optional)
+ - 'hostname' (optional)
+
+ For example:
+
+ {'type_instance':'foo', 'values': 1}
+ {'type_instance':'bar', 'type': 'DERIVE', 'values': 1}
+ {'type_instance':'bar', 'type': 'DERIVE', 'values': 1,
+ 'meta': {'tagA': 'valA'}}
+ {'type': 'dropped_bytes', 'values': [1,2]}
+ """
+        raise NotImplementedError("Must be implemented by the subclass!")
+
+ def dispatch_metric(self, metric):
+ values = metric['values']
+ if not isinstance(values, list) and not isinstance(values, tuple):
+ values = (values,)
+
+ type_instance = str(metric.get('type_instance', ''))
+ if len(type_instance) > self.MAX_IDENTIFIER_LENGTH:
+ self.logger.warning(
+ '%s: Identifier "%s..." too long (length: %d, max limit: %d)' %
+ (self.plugin, type_instance[:24], len(type_instance),
+ self.MAX_IDENTIFIER_LENGTH))
+
+ v = self.collectd.Values(
+ plugin=self.plugin,
+ host=metric.get('hostname', ''),
+ type=metric.get('type', 'gauge'),
+ plugin_instance=self.plugin_instance,
+ type_instance=type_instance,
+ values=values,
+            # workaround for https://github.com/collectd/collectd/issues/716
+ meta=metric.get('meta', {'0': True})
+ )
+ v.dispatch()
+
+ def execute(self, cmd, shell=True, cwd=None):
+ """Executes a program with arguments.
+
+ Args:
+ cmd: a list of program arguments where the first item is the
+ program name.
+ shell: whether to use the shell as the program to execute (default=
+ True).
+ cwd: the directory to change to before running the program
+ (default=None).
+
+ Returns:
+ A tuple containing the standard output and error strings if the
+ program execution has been successful.
+
+ ("foobar\n", "")
+
+ None if the command couldn't be executed or returned a non-zero
+ status code
+ """
+ start_time = time.time()
+ try:
+ proc = subprocess.Popen(
+ cmd,
+ cwd=cwd,
+ shell=shell,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE
+ )
+ (stdout, stderr) = proc.communicate()
+ stdout = stdout.rstrip('\n')
+ except Exception as e:
+ self.logger.error("Cannot execute command '%s': %s : %s" %
+ (cmd, str(e), traceback.format_exc()))
+ return None
+
+ returncode = proc.returncode
+
+ if returncode != 0:
+ self.logger.error("Command '%s' failed (return code %d): %s" %
+ (cmd, returncode, stderr))
+ return None
+ if self.debug:
+ elapsedtime = time.time() - start_time
+ self.logger.info("Command '%s' returned %s in %0.3fs" %
+ (cmd, returncode, elapsedtime))
+
+ if not stdout and self.debug:
+ self.logger.info("Command '%s' returned no output!", cmd)
+
+ return (stdout, stderr)
+
+ def execute_to_json(self, *args, **kwargs):
+ """Executes a program and decodes the output as a JSON string.
+
+ See execute().
+
+ Returns:
+ A Python object or
+ None if the execution of the program or JSON decoding fails.
+ """
+ outputs = self.execute(*args, **kwargs)
+ if outputs:
+ try:
+ return json.loads(outputs[0])
+ except ValueError as e:
+ self.logger.error("{}: document: '{}'".format(e, outputs[0]))
+
+ @staticmethod
+ def restore_sigchld():
+ """Restores the SIGCHLD handler for Python <= v2.6.
+
+ This should be provided to collectd as the init callback by plugins
+ that execute external programs.
+
+ Note that it will BREAK the exec plugin!!!
+
+ See https://github.com/deniszh/collectd-iostat-python/issues/2 for
+ details.
+ """
+ if sys.version_info[0] == 2 and sys.version_info[1] <= 6:
+ signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
+ def notification_callback(self, notification):
+ if not self.depends_on_resource:
+ return
+
+ try:
+ data = json.loads(notification.message)
+ except ValueError:
+ return
+
+ if 'value' not in data:
+ self.logger.warning(
+ "%s: missing 'value' in notification" %
+ self.__class__.__name__)
+ elif 'resource' not in data:
+ self.logger.warning(
+ "%s: missing 'resource' in notification" %
+ self.__class__.__name__)
+ elif data['resource'] == self.depends_on_resource:
+ do_collect_data = data['value'] > 0
+ if self.do_collect_data != do_collect_data:
+ # log only the transitions
+ self.logger.notice("%s: do_collect_data=%s" %
+ (self.__class__.__name__, do_collect_data))
+ self.do_collect_data = do_collect_data
+
+
+class CephBase(Base):
+
+ def __init__(self, *args, **kwargs):
+ super(CephBase, self).__init__(*args, **kwargs)
+ self.cluster = 'ceph'
+
+ def config_callback(self, conf):
+ super(CephBase, self).config_callback(conf)
+
+ for node in conf.children:
+ if node.key == "Cluster":
+ self.cluster = node.values[0]
+ self.plugin_instance = self.cluster
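+
+# Minimal usage sketch (illustrative only, not shipped by this change): a
+# plugin module subclasses Base, implements itermetrics() and registers
+# its callbacks with collectd, e.g.:
+#
+#   class ExamplePlugin(Base):
+#       def itermetrics(self):
+#           yield {'type_instance': 'example', 'values': 42}
+#
+#   plugin = ExamplePlugin(collectd, 'example')
+#   collectd.register_config(plugin.config_callback)
+#   collectd.register_read(plugin.conditional_read_callback, INTERVAL)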
diff --git a/collectd/files/plugin/collectd_libvirt_check.py b/collectd/files/plugin/collectd_libvirt_check.py
new file mode 100644
index 0000000..4660609
--- /dev/null
+++ b/collectd/files/plugin/collectd_libvirt_check.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+import libvirt
+
+import collectd_base as base
+
+NAME = 'libvirt'
+URI = 'qemu:///system'
+
+
+class LibvirtCheckPlugin(base.Base):
+
+ def __init__(self, *args, **kwargs):
+ super(LibvirtCheckPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.uri = URI
+
+ def config_callback(self, conf):
+ super(LibvirtCheckPlugin, self).config_callback(conf)
+
+ for node in conf.children:
+ if node.key == 'Uri':
+ self.uri = node.values[0]
+
+ def read_callback(self):
+ try:
+ cnx = libvirt.openReadOnly(self.uri)
+ cnx.numOfDefinedDomains()
+ self.dispatch_check_metric(self.OK)
+ except libvirt.libvirtError as e:
+            msg = 'Failed to query libvirt ({}): {}'.format(self.uri, e)
+ self.dispatch_check_metric(self.FAIL, msg)
+
+
+plugin = LibvirtCheckPlugin(collectd)
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
diff --git a/collectd/files/plugin/collectd_memcached_check.py b/collectd/files/plugin/collectd_memcached_check.py
new file mode 100644
index 0000000..fb44aeb
--- /dev/null
+++ b/collectd/files/plugin/collectd_memcached_check.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+
+import collectd_base as base
+
+from pymemcache.client import base as memcache
+
+NAME = 'memcached'
+
+
+class MemcachedCheckPlugin(base.Base):
+
+ def __init__(self, *args, **kwargs):
+ super(MemcachedCheckPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.host = None
+ self.port = 11211
+
+ def config_callback(self, conf):
+ super(MemcachedCheckPlugin, self).config_callback(conf)
+
+ for node in conf.children:
+ if node.key == 'Host':
+ self.host = node.values[0]
+ if node.key == 'Port':
+ # Must coerce to integer to avoid getting a float value.
+ self.port = int(node.values[0])
+
+ if self.host is None:
+ self.logger.error('Missing Host parameter')
+
+ def read_callback(self):
+ try:
+
+ mc = memcache.Client((self.host, self.port))
+ mc.get('__get_non_existent_key__')
+ self.dispatch_check_metric(self.OK)
+ except Exception as e:
+            msg = 'Failed to query memcached ({}:{}): {}'.format(
+                self.host, self.port, e)
+ self.logger.error(msg)
+ self.dispatch_check_metric(self.FAIL, msg)
+
+
+plugin = MemcachedCheckPlugin(collectd)
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
diff --git a/collectd/files/plugin/collectd_mysql_check.py b/collectd/files/plugin/collectd_mysql_check.py
new file mode 100644
index 0000000..a42414c
--- /dev/null
+++ b/collectd/files/plugin/collectd_mysql_check.py
@@ -0,0 +1,119 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+
+import collectd_base as base
+import os.path
+import pymysql
+
+NAME = 'mysql'
+
+
+class MySQLCheckPlugin(base.Base):
+
+ def __init__(self, *args, **kwargs):
+ super(MySQLCheckPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.host = None
+ self.socket = None
+ self.port = '3306'
+ self.sql = 'SELECT VERSION()'
+ self.username = None
+ self.password = None
+ self.database = None
+
+ def config_callback(self, conf):
+ super(MySQLCheckPlugin, self).config_callback(conf)
+
+ for node in conf.children:
+ if node.key == 'Socket':
+ self.socket = node.values[0]
+ if node.key == 'Host':
+ self.host = node.values[0]
+ if node.key == 'Port':
+ self.port = int(node.values[0])
+ if node.key == 'Username':
+ self.username = node.values[0]
+ if node.key == 'Password':
+ self.password = node.values[0]
+ if node.key == 'Database': # Optional
+ self.database = node.values[0]
+ if node.key == 'SQL':
+ self.sql = node.values[0]
+
+ if not self.socket and not self.host:
+ # Try to find MySQL socket
+ for sock in ('/var/run/mysqld/mysqld.sock',
+ '/run/mysqld/mysqld.sock'):
+ if os.path.exists(sock):
+ self.socket = sock
+                    self.logger.info('Using socket {} as a fallback'.format(
+ sock))
+ break
+
+ if not self.socket:
+ self.logger.error('Missing parameter: Host or Socket')
+
+ if self.socket:
+ # The hostname must be set to localhost to work with socket
+ self.host = 'localhost'
+            self.logger.info('Using Socket={}'.format(self.socket))
+
+ if not self.username:
+ self.logger.warning('Missing parameter Username')
+
+ def read_callback(self):
+ cnx = None
+ try:
+ cnx = pymysql.connect(host=self.host,
+ port=self.port,
+ unix_socket=self.socket,
+ user=self.username,
+ password=self.password,
+ db=self.database,
+ connect_timeout=3)
+
+ with cnx.cursor() as cursor:
+ cursor.execute(self.sql)
+ cursor.fetchone()
+ self.dispatch_check_metric(self.OK)
+ except Exception as e:
+            msg = 'Failed to query MySQL "{}": {}'.format(self.sql, e)
+ self.logger.error(msg)
+ self.dispatch_check_metric(self.FAIL, msg)
+
+ finally:
+ if cnx is not None:
+ cnx.close()
+
+
+plugin = MySQLCheckPlugin(collectd)
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
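+
+# Collectd configuration sketch (illustrative values; the parameters map
+# to config_callback above):
+#   <Plugin python>
+#     Import "collectd_mysql_check"
+#     <Module "collectd_mysql_check">
+#       Socket "/var/run/mysqld/mysqld.sock"
+#       Username "monitor"
+#       Password "secret"
+#     </Module>
+#   </Plugin>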
diff --git a/collectd/files/plugin/collectd_openstack.py b/collectd/files/plugin/collectd_openstack.py
new file mode 100644
index 0000000..5539338
--- /dev/null
+++ b/collectd/files/plugin/collectd_openstack.py
@@ -0,0 +1,349 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import dateutil.parser
+import dateutil.tz
+import requests
+import simplejson as json
+import traceback
+
+import collectd_base as base
+
+from collections import defaultdict
+
+# By default, query OpenStack API endpoints every 50 seconds. We choose a value
+# less than the default group by interval (which is 60 seconds) to avoid gaps
+# in the Grafana graphs.
+INTERVAL = 50
+
+
+class KeystoneException(Exception):
+ pass
+
+
+class OSClient(object):
+ """ Base class for querying the OpenStack API endpoints.
+
+ It uses the Keystone service catalog to discover the API endpoints.
+ """
+ EXPIRATION_TOKEN_DELTA = datetime.timedelta(0, 30)
+
+ def __init__(self, username, password, tenant, keystone_url, timeout,
+ logger, max_retries):
+ self.logger = logger
+ self.username = username
+ self.password = password
+ self.tenant_name = tenant
+ self.keystone_url = keystone_url
+ self.service_catalog = []
+ self.tenant_id = None
+ self.timeout = timeout
+ self.token = None
+ self.valid_until = None
+
+ # Note: prior to urllib3 v1.9, retries are made on failed connections
+ # but not on timeout and backoff time is not supported.
+ # (at this time we ship requests 2.2.1 and urllib3 1.6.1 or 1.7.1)
+ self.session = requests.Session()
+ self.session.mount(
+ 'http://', requests.adapters.HTTPAdapter(max_retries=max_retries))
+ self.session.mount(
+ 'https://', requests.adapters.HTTPAdapter(max_retries=max_retries))
+
+ def is_valid_token(self):
+ now = datetime.datetime.now(tz=dateutil.tz.tzutc())
+ return self.token and self.valid_until and self.valid_until > now
+
+ def clear_token(self):
+ self.token = None
+ self.valid_until = None
+
+ def get_token(self):
+ self.clear_token()
+ data = json.dumps({
+ "auth":
+ {
+ 'tenantName': self.tenant_name,
+ 'passwordCredentials':
+ {
+ 'username': self.username,
+ 'password': self.password
+ }
+ }
+ })
+ self.logger.info("Trying to get token from '%s'" % self.keystone_url)
+ r = self.make_request('post',
+ '%s/tokens' % self.keystone_url, data=data,
+ token_required=False)
+ if not r:
+ raise KeystoneException("Cannot get a valid token from %s" %
+ self.keystone_url)
+
+ if r.status_code < 200 or r.status_code > 299:
+ raise KeystoneException("%s responded with code %d" %
+ (self.keystone_url, r.status_code))
+
+ data = r.json()
+ self.logger.debug("Got response from Keystone: '%s'" % data)
+ self.token = data['access']['token']['id']
+ self.tenant_id = data['access']['token']['tenant']['id']
+ self.valid_until = dateutil.parser.parse(
+ data['access']['token']['expires']) - self.EXPIRATION_TOKEN_DELTA
+ self.service_catalog = []
+ for item in data['access']['serviceCatalog']:
+ endpoint = item['endpoints'][0]
+ self.service_catalog.append({
+ 'name': item['name'],
+ 'region': endpoint['region'],
+ 'service_type': item['type'],
+ 'url': endpoint['internalURL'],
+ 'admin_url': endpoint['adminURL'],
+ })
+
+ self.logger.debug("Got token '%s'" % self.token)
+ return self.token
+
+ def make_request(self, verb, url, data=None, token_required=True):
+ kwargs = {
+ 'url': url,
+ 'timeout': self.timeout,
+ 'headers': {'Content-type': 'application/json'}
+ }
+ if token_required and not self.is_valid_token() and \
+ not self.get_token():
+ self.logger.error("Aborting request, no valid token")
+ return
+ elif token_required:
+ kwargs['headers']['X-Auth-Token'] = self.token
+
+ if data is not None:
+ kwargs['data'] = data
+
+ func = getattr(self.session, verb.lower())
+
+ try:
+ r = func(**kwargs)
+ except Exception as e:
+ self.logger.error("Got exception for '%s': '%s'" %
+ (kwargs['url'], e))
+ return
+
+ self.logger.info("%s responded with status code %d" %
+ (kwargs['url'], r.status_code))
+ if r.status_code == 401:
+ # Clear token in case it is revoked or invalid
+ self.clear_token()
+
+ return r
+
+
+class CollectdPlugin(base.Base):
+
+ def __init__(self, *args, **kwargs):
+ super(CollectdPlugin, self).__init__(*args, **kwargs)
+ # The timeout/max_retries are defined according to the observations on
+ # 200 nodes environments with 600 VMs. See #1554502 for details.
+ self.timeout = 20
+ self.max_retries = 2
+ self.os_client = None
+ self.extra_config = {}
+
+ def _build_url(self, service, resource):
+ s = (self.get_service(service) or {})
+ # the adminURL must be used to access resources with Keystone API v2
+ if service == 'keystone' and \
+ (resource in ['tenants', 'users'] or 'OS-KS' in resource):
+ url = s.get('admin_url')
+ else:
+ url = s.get('url')
+
+ if url:
+ if url[-1] != '/':
+ url += '/'
+ url = "%s%s" % (url, resource)
+ else:
+ self.logger.error("Service '%s' not found in catalog" % service)
+ return url
+
+ def raw_get(self, url, token_required=False):
+ return self.os_client.make_request('get', url,
+ token_required=token_required)
+
+ def iter_workers(self, service):
+ """ Return the list of workers and their state
+
+        Here is an example of a returned dictionary:
+ {
+ 'host': 'node.example.com',
+ 'service': 'nova-compute',
+ 'state': 'up'
+ }
+
+ where 'state' can be 'up', 'down' or 'disabled'
+ """
+
+ if service == 'neutron':
+ endpoint = 'v2.0/agents'
+ entry = 'agents'
+ else:
+ endpoint = 'os-services'
+ entry = 'services'
+
+ ost_services_r = self.get(service, endpoint)
+
+ msg = "Cannot get state of {} workers".format(service)
+ if ost_services_r is None:
+ self.logger.warning(msg)
+ elif ost_services_r.status_code != 200:
+ msg = "{}: Got {} ({})".format(
+ msg, ost_services_r.status_code, ost_services_r.content)
+ self.logger.warning(msg)
+ else:
+ try:
+ r_json = ost_services_r.json()
+ except ValueError:
+ r_json = {}
+
+ if entry not in r_json:
+ msg = "{}: couldn't find '{}' key".format(msg, entry)
+ self.logger.warning(msg)
+ else:
+ for val in r_json[entry]:
+ data = {'host': val['host'], 'service': val['binary']}
+
+ if service == 'neutron':
+ if not val['admin_state_up']:
+ data['state'] = 'disabled'
+ else:
+ data['state'] = 'up' if val['alive'] else 'down'
+ else:
+ if val['status'] == 'disabled':
+ data['state'] = 'disabled'
+ elif val['state'] == 'up' or val['state'] == 'down':
+ data['state'] = val['state']
+ else:
+ msg = "Unknown state for {} workers:{}".format(
+ service, val['state'])
+ self.logger.warning(msg)
+ continue
+
+ yield data
+
+ def get(self, service, resource):
+ url = self._build_url(service, resource)
+ if not url:
+ return
+ self.logger.info("GET '%s'" % url)
+ return self.os_client.make_request('get', url)
+
+ @property
+ def service_catalog(self):
+ if not self.os_client.service_catalog:
+ # In case the service catalog is empty (e.g. Keystone was down when
+ # collectd started), try to get a new token
+ self.os_client.get_token()
+ return self.os_client.service_catalog
+
+ def get_service(self, service_name):
+ return next((x for x in self.service_catalog
+ if x['name'] == service_name), None)
+
+ def config_callback(self, config):
+ super(CollectdPlugin, self).config_callback(config)
+ for node in config.children:
+ if node.key == 'Username':
+ username = node.values[0]
+ elif node.key == 'Password':
+ password = node.values[0]
+ elif node.key == 'Tenant':
+ tenant_name = node.values[0]
+ elif node.key == 'KeystoneUrl':
+ keystone_url = node.values[0]
+ self.os_client = OSClient(username, password, tenant_name,
+ keystone_url, self.timeout, self.logger,
+ self.max_retries)
+
+ def read_callback(self):
+ """ Wrapper method
+
+ This method calls the actual method which performs
+ collection.
+ """
+
+ try:
+ self.collect()
+ except Exception as e:
+ msg = '{}: failed to get metrics: {}: {}'.format(
+ self.service_name or self.plugin, e, traceback.format_exc())
+ self.logger.error(msg)
+
+ def collect(self):
+ """ Read metrics and dispatch values
+
+ This method should be overridden by the derived classes.
+ """
+
+ raise NotImplementedError('collect() method needs to be overridden!')
+
+ def get_objects(self, project, object_name, api_version='',
+ params='all_tenants=1'):
+ """ Return a list of OpenStack objects
+
+ See get_objects_details()
+ """
+ return self._get_objects(project, object_name, api_version, params,
+ False)
+
+ def get_objects_details(self, project, object_name, api_version='',
+ params='all_tenants=1'):
+ """ Return a list of details about OpenStack objects
+
+ The API version is not always included in the URL endpoint
+ registered in Keystone (e.g. Glance). In this case, use the
+ api_version parameter to specify which version should be used.
+ """
+ return self._get_objects(project, object_name, api_version, params,
+ True)
+
+ def _get_objects(self, project, object_name, api_version, params, detail):
+ if api_version:
+ resource = '%s/%s' % (api_version, object_name)
+ else:
+ resource = '%s' % (object_name)
+ if detail:
+ resource = '%s/detail' % (resource)
+ if params:
+ resource = '%s?%s' % (resource, params)
+ # TODO(scroiset): use pagination to handle large collection
+ r = self.get(project, resource)
+ data = r.json() if r else {}
+ if object_name not in data:
+ self.logger.warning('Could not find %s %s' % (project,
+ object_name))
+ return []
+ return data.get(object_name)
+
+ def count_objects_group_by(self,
+ list_object,
+ group_by_func,
+ count_func=None):
+
+ """ Count the number of items grouped by arbitrary criteria."""
+
+ counts = defaultdict(int)
+ for obj in list_object:
+ s = group_by_func(obj)
+ counts[s] += count_func(obj) if count_func else 1
+ return counts
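+
+# A minimal usage sketch for count_objects_group_by; the data below is
+# illustrative, not taken from a real deployment:
+#
+#   volumes = [{'status': 'available', 'size': 10},
+#              {'status': 'error', 'size': 5}]
+#   self.count_objects_group_by(volumes, lambda v: v['status'])
+#   # => {'available': 1, 'error': 1}
+#   self.count_objects_group_by(volumes, lambda v: v['status'],
+#                               count_func=lambda v: v['size'] * 10 ** 9)
+#   # => {'available': 10000000000, 'error': 5000000000}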
diff --git a/collectd/files/plugin/collectd_pacemaker.py b/collectd/files/plugin/collectd_pacemaker.py
new file mode 100644
index 0000000..682c100
--- /dev/null
+++ b/collectd/files/plugin/collectd_pacemaker.py
@@ -0,0 +1,301 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+from collections import Counter
+from collections import defaultdict
+from sets import Set
+import socket
+import xml.etree.ElementTree as ET
+
+import collectd_base as base
+
+NAME = 'pacemaker'
+CRM_MON_BINARY = '/usr/sbin/crm_mon'
+
+# Node status
+OFFLINE_STATUS = 0
+MAINTENANCE_STATUS = 1
+ONLINE_STATUS = 2
+
+
+class CrmMonitorPlugin(base.Base):
+
+ def __init__(self, *args, **kwargs):
+ super(CrmMonitorPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.crm_mon_binary = CRM_MON_BINARY
+ self.hostname = socket.getfqdn()
+ self.notify_resource = None
+ self.resources = {}
+ self.history = {}
+
+ def config_callback(self, conf):
+ super(CrmMonitorPlugin, self).config_callback(conf)
+
+ for node in conf.children:
+ if node.key == 'Hostname':
+ self.hostname = node.values[0]
+ elif node.key == 'CrmMonBinary':
+ self.crm_mon_binary = node.values[0]
+ elif node.key == 'Resource':
+ self.resources[node.values[0]] = node.values[-1]
+ elif node.key == 'NotifyResource':
+ self.notify_resource = node.values[0]
+
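+ # A hypothetical configuration block; each Resource entry maps a
+ # Pacemaker resource id to the name reported in the metrics:
+ #
+ #   <Module "collectd_pacemaker">
+ #     CrmMonBinary "/usr/sbin/crm_mon"
+ #     Resource "vip__public" "vip__public"
+ #     NotifyResource "vip__management"
+ #   </Module>
+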
+ def itermetrics(self):
+ def str_to_bool(v):
+ return str(v).lower() == 'true'
+
+ def str_to_boolint(v):
+ if str_to_bool(v):
+ return 1
+ else:
+ return 0
+
+ def shorten_hostname(v):
+ return v.split('.')[0]
+
+ def same_hostname(v):
+ if v is not None and v.get('name') == self.hostname:
+ return 1
+ return 0
+
+ out, err = self.execute([self.crm_mon_binary, '--as-xml', '-r', '-f'],
+ shell=False)
+ if not out:
+ raise base.CheckException(
+ "Failed to execute crm_mon '{}'".format(err))
+
+ try:
+ root = ET.fromstring(out)
+ except ET.ParseError:
+ raise base.CheckException(
+ "Failed to parse XML '{}'".format(out[:64]))
+
+ if self.notify_resource:
+ # Notify the other collectd plugins whether the resource runs
+ # locally or not
+ node = root.find('resources/resource[@id="{}"]/node'.format(
+ self.notify_resource))
+ self.collectd.Notification(
+ type='gauge',
+ message='{{"resource":"{}","value":{}}}'.format(
+ self.notify_resource, same_hostname(node)),
+ severity=self.collectd.NOTIF_OKAY
+ ).dispatch()
+ # The metric needs to be emitted too for the Lua plugins executed
+ # by the metric_collector service
+ yield {
+ 'type_instance': 'local_resource_active',
+ 'values': same_hostname(node),
+ 'meta': {'resource': self.notify_resource}
+ }
+
+ summary = root.find('summary')
+ current_dc = summary.find('current_dc')
+ # The metric needs to be emitted for the alarms that leverage the other
+ # metrics emitted by the plugin
+ yield {
+ 'type_instance': 'local_dc_active',
+ 'values': same_hostname(current_dc),
+ }
+
+ if current_dc.get('name') != self.hostname:
+ # The other metrics are only collected from the cluster's DC
+ return
+
+ # Report global cluster metrics
+ yield {
+ 'type_instance': 'dc',
+ 'values': str_to_boolint(current_dc.get('present', 'false'))
+ }
+
+ yield {
+ 'type_instance': 'quorum_status',
+ 'values': str_to_boolint(current_dc.get('with_quorum', 'false'))
+ }
+ yield {
+ 'type_instance': 'configured_nodes',
+ 'values': int(summary.find('nodes_configured').get('number'))
+ }
+ yield {
+ 'type_instance': 'configured_resources',
+ 'values': int(summary.find('resources_configured').get('number'))
+ }
+
+ # Report node status metrics
+ cluster_nodes = []
+ aggregated_nodes_status = {'online': 0, 'offline': 0, 'maintenance': 0}
+ nodes_total = 0
+ for node in root.find('nodes').iter('node'):
+ nodes_total += 1
+ hostname = shorten_hostname(node.get('name'))
+ cluster_nodes.append(node.get('name'))
+ if str_to_bool(node.get('online')):
+ if str_to_bool(node.get('maintenance')):
+ aggregated_nodes_status['maintenance'] += 1
+ yield {
+ 'type_instance': 'node_status',
+ 'values': MAINTENANCE_STATUS,
+ 'meta': {'status': 'maintenance', 'host': hostname}
+ }
+ else:
+ aggregated_nodes_status['online'] += 1
+ yield {
+ 'type_instance': 'node_status',
+ 'values': ONLINE_STATUS,
+ 'meta': {'status': 'online', 'host': hostname}
+ }
+ else:
+ aggregated_nodes_status['offline'] += 1
+ yield {
+ 'type_instance': 'node_status',
+ 'values': OFFLINE_STATUS,
+ 'meta': {'status': 'offline', 'host': hostname}
+ }
+
+ for status, cnt in aggregated_nodes_status.items():
+ yield {
+ 'type_instance': 'nodes_count',
+ 'values': cnt,
+ 'meta': {'status': status}
+ }
+ yield {
+ 'type_instance': 'nodes_percent',
+ 'values': 100.0 * cnt / nodes_total,
+ 'meta': {'status': status}
+ }
+
+ # Report the number of resources per status
+ # Clone resources can run on multiple nodes while "simple" resources
+ # run on only one node at a time
+ aggregated_resources = defaultdict(Counter)
+ resources = root.find('resources')
+ for resource_id, resource_name in self.resources.iteritems():
+ resource_elts = []
+ simple_resource = None
+ clone_resource = resources.find(
+ 'clone/resource[@id="{}"]/..'.format(resource_id))
+ if not clone_resource:
+ simple_resource = resources.find('resource[@id="{}"]'.format(
+ resource_id))
+ if simple_resource:
+ resource_elts = [simple_resource]
+ else:
+ resource_elts = clone_resource.findall('resource')
+
+ if not resource_elts:
+ self.logger.error("{}: Couldn't find resource '{}'".format(
+ self.plugin, resource_id))
+ continue
+
+ total = 0
+ for item in resource_elts:
+ total += 1
+ if (item.get('role') in ('Slave', 'Master') and
+ not str_to_bool(item.get('failed'))):
+ # Multi-master resource
+ aggregated_resources[resource_name]['up'] += 1
+ elif item.get('role') == 'Started':
+ aggregated_resources[resource_name]['up'] += 1
+ else:
+ aggregated_resources[resource_name]['down'] += 1
+
+ if simple_resource:
+ # Report on which node the "simple" resource is running
+ for node in cluster_nodes:
+ yield {
+ 'type_instance': 'local_resource_active',
+ 'values': str_to_boolint(
+ node == simple_resource.find('node').get('name')),
+ 'meta': {'resource': resource_name,
+ 'host': shorten_hostname(node)}
+ }
+
+ for status in ('up', 'down'):
+ cnt = aggregated_resources[resource_name][status]
+ yield {
+ 'type_instance': 'resource_count',
+ 'values': cnt,
+ 'meta': {'status': status, 'resource': resource_name}
+ }
+ yield {
+ 'type_instance': 'resource_percent',
+ 'values': 100.0 * cnt / total,
+ 'meta': {'status': status, 'resource': resource_name}
+ }
+
+ # Collect operations' history metrics for the monitored resources
+ #
+ # The reported count for the resource's operations is an approximate
+ # value because crm_mon doesn't provide the exact number. To estimate
+ # the number of operations applied to a resource, the plugin keeps a
+ # copy of call_ids and compares it with the current value.
+ for node in root.find('node_history').iter('node'):
+ hostname = shorten_hostname(node.get('name'))
+ if hostname not in self.history:
+ self.history[hostname] = {}
+
+ for resource_id, resource_name in self.resources.iteritems():
+ if resource_id not in self.history[hostname]:
+ self.history[hostname][resource_id] = {
+ 'fail_count': 0,
+ 'ops_count': 0,
+ 'call_ids': Set([])
+ }
+ v = self.history[hostname][resource_id]
+
+ res_history = node.find('resource_history[@id="{}"]'.format(
+ resource_id))
+ if res_history:
+ # For simple resources, the resource_history element only
+ # exists for the node that runs the resource
+ v['fail_count'] += int(res_history.get('fail-count', 0))
+ call_ids = Set([
+ i.get('call') for i in res_history.findall(
+ 'operation_history')])
+ if call_ids:
+ v['ops_count'] += len(call_ids - v['call_ids'])
+ v['call_ids'] = call_ids
+
+ yield {
+ 'type_instance': 'resource_failures',
+ 'values': v['fail_count'],
+ 'meta': {'resource': resource_name, 'host': hostname}
+ }
+ yield {
+ 'type_instance': 'resource_operations',
+ 'values': v['ops_count'],
+ 'meta': {'resource': resource_name, 'host': hostname}
+ }
+
+
+plugin = CrmMonitorPlugin(collectd)
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
diff --git a/collectd/files/plugin/elasticsearch_cluster.py b/collectd/files/plugin/elasticsearch_cluster.py
new file mode 100644
index 0000000..c3dcf37
--- /dev/null
+++ b/collectd/files/plugin/elasticsearch_cluster.py
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+import requests
+
+import collectd_base as base
+
+NAME = 'elasticsearch_cluster'
+HEALTH_MAP = {
+ 'green': 1,
+ 'yellow': 2,
+ 'red': 3,
+}
+METRICS = ['number_of_nodes', 'active_primary_shards', 'active_shards',
+ 'relocating_shards', 'unassigned_shards',
+ 'number_of_pending_tasks', 'initializing_shards']
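+
+# Abridged example of a /_cluster/health response (illustrative values;
+# the available fields vary with the Elasticsearch version):
+#   {"status": "green", "number_of_nodes": 3, "active_shards": 10,
+#    "relocating_shards": 0, "unassigned_shards": 0,
+#    "number_of_pending_tasks": 0}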
+
+
+class ElasticsearchClusterHealthPlugin(base.Base):
+ def __init__(self, *args, **kwargs):
+ super(ElasticsearchClusterHealthPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.address = '127.0.0.1'
+ self.port = 9200
+ self.session = requests.Session()
+ self.url = None
+ self.session.mount(
+ 'http://',
+ requests.adapters.HTTPAdapter(max_retries=self.max_retries)
+ )
+
+ def config_callback(self, conf):
+ super(ElasticsearchClusterHealthPlugin, self).config_callback(conf)
+
+ for node in conf.children:
+ if node.key == 'Address':
+ self.address = node.values[0]
+ if node.key == 'Port':
+ self.port = node.values[0]
+
+ self.url = "http://{address}:{port}/_cluster/health".format(
+ **{
+ 'address': self.address,
+ 'port': int(self.port),
+ })
+
+ def itermetrics(self):
+ try:
+ r = self.session.get(self.url)
+ except Exception as e:
+ msg = "Got exception for '{}': {}".format(self.url, e)
+ raise base.CheckException(msg)
+
+ if r.status_code != 200:
+ msg = "{} responded with code {}".format(
+ self.url, r.status_code)
+ raise base.CheckException(msg)
+
+ data = r.json()
+ self.logger.debug("Got response from Elasticsearch: '%s'" % data)
+
+ yield {
+ 'type_instance': 'health',
+ 'values': HEALTH_MAP[data['status']]
+ }
+
+ for metric in METRICS:
+ value = data.get(metric)
+ if value is None:
+ # Depending on the Elasticsearch version, not all metrics are
+ # available
+ self.logger.info("Couldn't find {} metric".format(metric))
+ continue
+ yield {
+ 'type_instance': metric,
+ 'values': value
+ }
+
+plugin = ElasticsearchClusterHealthPlugin(collectd, 'elasticsearch')
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
diff --git a/collectd/files/plugin/haproxy.py b/collectd/files/plugin/haproxy.py
new file mode 100644
index 0000000..17514e2
--- /dev/null
+++ b/collectd/files/plugin/haproxy.py
@@ -0,0 +1,307 @@
+# haproxy-collectd-plugin - haproxy.py
+#
+# Original Author: Michael Leinartas
+# Substantial additions by Mirantis
+# Description: This is a collectd plugin which runs under the Python plugin to
+# collect metrics from haproxy.
+# Plugin structure and logging func taken from
+# https://github.com/phrawzty/rabbitmq-collectd-plugin
+
+# Copyright (c) 2011 Michael Leinartas
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+import collectd
+import csv
+import itertools
+import socket
+
+import collectd_base as base
+
+from collections import defaultdict
+
+NAME = 'haproxy'
+RECV_SIZE = 1024
+SERVER_METRICS = {
+ 'CurrConns': ('connections', 'gauge'),
+ 'CurrSslConns': ('ssl_connections', 'gauge'),
+ 'PipesUsed': ('pipes_used', 'gauge'),
+ 'PipesFree': ('pipes_free', 'gauge'),
+ 'Run_queue': ('run_queue', 'gauge'),
+ 'Tasks': ('tasks', 'gauge'),
+ 'Uptime_sec': ('uptime', 'gauge'),
+}
+FRONTEND_METRIC_TYPES = {
+ 'bin': ('bytes_in', 'gauge'),
+ 'bout': ('bytes_out', 'gauge'),
+ 'dresp': ('denied_responses', 'gauge'),
+ 'dreq': ('denied_requests', 'gauge'),
+ 'ereq': ('error_requests', 'gauge'),
+ 'hrsp_1xx': ('response_1xx', 'gauge'),
+ 'hrsp_2xx': ('response_2xx', 'gauge'),
+ 'hrsp_3xx': ('response_3xx', 'gauge'),
+ 'hrsp_4xx': ('response_4xx', 'gauge'),
+ 'hrsp_5xx': ('response_5xx', 'gauge'),
+ 'hrsp_other': ('response_other', 'gauge'),
+ 'stot': ('session_total', 'gauge'),
+ 'scur': ('session_current', 'gauge'),
+}
+BACKEND_METRIC_TYPES = {
+ 'bin': ('bytes_in', 'gauge'),
+ 'bout': ('bytes_out', 'gauge'),
+ 'downtime': ('downtime', 'gauge'),
+ 'dresp': ('denied_responses', 'gauge'),
+ 'dreq': ('denied_requests', 'gauge'),
+ 'econ': ('error_connection', 'gauge'),
+ 'eresp': ('error_responses', 'gauge'),
+ 'hrsp_1xx': ('response_1xx', 'gauge'),
+ 'hrsp_2xx': ('response_2xx', 'gauge'),
+ 'hrsp_3xx': ('response_3xx', 'gauge'),
+ 'hrsp_4xx': ('response_4xx', 'gauge'),
+ 'hrsp_5xx': ('response_5xx', 'gauge'),
+ 'hrsp_other': ('response_other', 'gauge'),
+ 'qcur': ('queue_current', 'gauge'),
+ 'stot': ('session_total', 'gauge'),
+ 'scur': ('session_current', 'gauge'),
+ 'wredis': ('redistributed', 'gauge'),
+ 'wretr': ('retries', 'gauge'),
+ 'status': ('status', 'gauge'),
+}
+
+STATUS_MAP = {
+ 'DOWN': 0,
+ 'UP': 1,
+}
+
+FRONTEND_TYPE = '0'
+BACKEND_TYPE = '1'
+BACKEND_SERVER_TYPE = '2'
+
+HAPROXY_SOCKET = '/var/lib/haproxy/stats'
+DEFAULT_PROXY_MONITORS = ['server', 'frontend', 'backend', 'backend_server']
+
+
+class HAProxySocket(object):
+ def __init__(self, socket_file):
+ self.socket_file = socket_file
+
+ def connect(self):
+ s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ s.connect(self.socket_file)
+ return s
+
+ def communicate(self, command):
+ '''Send a command to the socket and return a response (raw string).'''
+
+ s = self.connect()
+ if not command.endswith('\n'):
+ command += '\n'
+ s.sendall(command)
+ result = ''
+ buf = s.recv(RECV_SIZE)
+ while buf:
+ result += buf
+ buf = s.recv(RECV_SIZE)
+ s.close()
+ return result
+
+ def get_server_info(self):
+ result = {}
+ output = self.communicate('show info')
+ for line in output.splitlines():
+ try:
+ key, val = line.split(':', 1)
+ except ValueError:
+ continue
+ result[key.strip()] = val.strip()
+ return result
+
+ def get_server_stats(self):
+ output = self.communicate('show stat')
+ # sanitize and make a list of lines
+ output = output.lstrip('# ').strip()
+ output = [l.strip(',') for l in output.splitlines()]
+ csvreader = csv.DictReader(output)
+ result = [d.copy() for d in csvreader]
+ return result
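+
+ # 'show stat' returns CSV data prefixed with '# ', for example
+ # (abridged columns, illustrative values):
+ #   # pxname,svname,scur,stot,status,type,
+ #   horizon,FRONTEND,1,20,OPEN,0,
+ #   horizon,node-1,0,10,UP,2,
+ #   horizon,BACKEND,1,20,UP,1,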
+
+
+class HAProxyPlugin(base.Base):
+ def __init__(self, *args, **kwargs):
+ super(HAProxyPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.names_mapping = {}
+ self.proxy_monitors = []
+ self.proxy_ignore = []
+ self.socket = HAPROXY_SOCKET
+
+ def get_proxy_name(self, pxname):
+ if pxname not in self.names_mapping:
+ self.logger.info('Mapping missing for "%s"' % pxname)
+ return self.names_mapping.get(pxname, pxname)
+
+ def itermetrics(self):
+ haproxy = HAProxySocket(self.socket)
+
+ # Collect server statistics
+ if 'server' in self.proxy_monitors:
+ try:
+ stats = haproxy.get_server_info()
+ except socket.error:
+ msg = "Unable to connect to HAProxy socket at {}".format(
+ self.socket)
+ raise base.CheckException(msg)
+ else:
+ for k, v in stats.iteritems():
+ if k not in SERVER_METRICS:
+ continue
+ type_instance = SERVER_METRICS[k][0]
+ type_ = SERVER_METRICS[k][1]
+ yield {
+ 'type_instance': type_instance,
+ 'type': type_,
+ 'values': int(v),
+ }
+
+ try:
+ stats = haproxy.get_server_stats()
+ except socket.error:
+ msg = "Unable to connect to HAProxy socket at {}".format(
+ self.socket)
+ raise base.CheckException(msg)
+
+ def match(x):
+ if x['pxname'] in self.proxy_ignore:
+ return False
+ return (x['svname'].lower() in self.proxy_monitors or
+ x['pxname'].lower() in self.proxy_monitors or
+ ('backend_server' in self.proxy_monitors and
+ x['type'] == BACKEND_SERVER_TYPE))
+ stats = filter(match, stats)
+ for stat in stats:
+ stat['pxname'] = self.get_proxy_name(stat['pxname'])
+
+ # Collect statistics for the frontends and the backends
+ for stat in itertools.ifilter(lambda x: x['type'] == FRONTEND_TYPE or
+ x['type'] == BACKEND_TYPE, stats):
+ if stat['type'] == FRONTEND_TYPE:
+ metrics = FRONTEND_METRIC_TYPES
+ side = 'frontend'
+ else:
+ metrics = BACKEND_METRIC_TYPES
+ side = 'backend'
+ for k, metric in metrics.iteritems():
+ if k not in stat:
+ self.logger.warning("Can't find {} metric".format(k))
+ continue
+ value = stat[k]
+
+ metric_name = '{}_{}'.format(side, metric[0])
+ meta = {
+ side: stat['pxname']
+ }
+
+ if metric[0] == 'status':
+ value = STATUS_MAP[value]
+ else:
+ value = int(value) if value else 0
+
+ yield {
+ 'type_instance': metric_name,
+ 'type': metric[1],
+ 'values': value,
+ 'meta': meta
+ }
+
+ # Count the number of servers per backend and state
+ backend_server_states = {}
+ for stat in itertools.ifilter(lambda x:
+ x['type'] == BACKEND_SERVER_TYPE, stats):
+ pxname = stat['pxname']
+ if pxname not in backend_server_states:
+ backend_server_states[pxname] = defaultdict(int)
+
+ # The status field for a server has the following syntax when a
+ # transition occurs with HAProxy >= 1.6: "DOWN 17/30" or "UP 1/3".
+ status = stat['status'].split(' ')[0]
+
+ # Only the UP and DOWN statuses are picked up here, although the
+ # field can also be NOLB, MAINT or MAINT(via).
+ if status in STATUS_MAP:
+ backend_server_states[pxname][status] += 1
+ # Emit metric for the backend server
+ yield {
+ 'type_instance': 'backend_server',
+ 'values': STATUS_MAP[status],
+ 'meta': {
+ 'backend': pxname,
+ 'state': status.lower(),
+ 'server': stat['svname'],
+ }
+ }
+
+ for pxname, states in backend_server_states.iteritems():
+ for s in STATUS_MAP.keys():
+ yield {
+ 'type_instance': 'backend_servers',
+ 'values': states.get(s, 0),
+ 'meta': {
+ 'backend': pxname,
+ 'state': s.lower()
+ }
+ }
+
+ def config_callback(self, conf):
+ for node in conf.children:
+ if node.key == "ProxyMonitor":
+ self.proxy_monitors.append(node.values[0])
+ elif node.key == "ProxyIgnore":
+ self.proxy_ignore.append(node.values[0])
+ elif node.key == "Socket":
+ self.socket = node.values[0]
+ elif node.key == "Mapping":
+ self.names_mapping[node.values[0]] = node.values[1]
+ else:
+ self.logger.warning('Unknown config key: %s' % node.key)
+
+ if not self.proxy_monitors:
+ self.proxy_monitors += DEFAULT_PROXY_MONITORS
+ self.proxy_monitors = [p.lower() for p in self.proxy_monitors]
+
+
+plugin = HAProxyPlugin(collectd)
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
diff --git a/collectd/files/plugin/http_check.py b/collectd/files/plugin/http_check.py
new file mode 100644
index 0000000..d84899a
--- /dev/null
+++ b/collectd/files/plugin/http_check.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+import requests
+
+import collectd_base as base
+
+
+NAME = 'http_check'
+
+
+class HTTPCheckPlugin(base.Base):
+
+ def __init__(self, *args, **kwargs):
+ super(HTTPCheckPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.session = requests.Session()
+ self.session.mount(
+ 'http://',
+ requests.adapters.HTTPAdapter(max_retries=self.max_retries)
+ )
+ self.session.mount(
+ 'https://',
+ requests.adapters.HTTPAdapter(max_retries=self.max_retries)
+ )
+ self.urls = {}
+ self.expected_codes = {}
+
+ def config_callback(self, config):
+ super(HTTPCheckPlugin, self).config_callback(config)
+ for node in config.children:
+ if node.key == "Url":
+ self.urls[node.values[0]] = node.values[1]
+ elif node.key == 'ExpectedCode':
+ self.expected_codes[node.values[0]] = int(node.values[1])
+
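+ # A hypothetical collectd configuration for this plugin; Url and
+ # ExpectedCode values are keyed by the check name:
+ #
+ #   <Module "http_check">
+ #     Url "keystone" "http://127.0.0.1:5000/"
+ #     ExpectedCode "keystone" "300"
+ #   </Module>
+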
+ def itermetrics(self):
+ for name, url in self.urls.items():
+ try:
+ r = self.session.get(url, timeout=self.timeout)
+ except Exception as e:
+ self.logger.warning("Got exception for '{}': {}".format(
+ url, e)
+ )
+ yield {'type_instance': name, 'values': self.FAIL}
+ else:
+
+ expected_code = self.expected_codes.get(name, 200)
+ if r.status_code != expected_code:
+ self.logger.warning(
+ ("{} ({}) responded with code {} "
+ "while {} is expected").format(name, url,
+ r.status_code,
+ expected_code))
+ yield {'type_instance': name, 'values': self.FAIL}
+ else:
+ self.logger.debug(
+ "Got response from {}: '{}'".format(url, r.text))
+ yield {'type_instance': name, 'values': self.OK}
+
+plugin = HTTPCheckPlugin(collectd)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, base.INTERVAL)
diff --git a/collectd/files/plugin/hypervisor_stats.py b/collectd/files/plugin/hypervisor_stats.py
new file mode 100644
index 0000000..d03f7ae
--- /dev/null
+++ b/collectd/files/plugin/hypervisor_stats.py
@@ -0,0 +1,100 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Collectd plugin for getting hypervisor statistics from Nova
+import collectd
+
+import collectd_openstack as openstack
+
+PLUGIN_NAME = 'hypervisor_stats'
+INTERVAL = openstack.INTERVAL
+
+
+class HypervisorStatsPlugin(openstack.CollectdPlugin):
+ """ Class to report the statistics on Nova hypervisors."""
+ VALUE_MAP = {
+ 'current_workload': 'running_tasks',
+ 'running_vms': 'running_instances',
+ 'local_gb_used': 'used_disk_GB',
+ 'free_disk_gb': 'free_disk_GB',
+ 'memory_mb_used': 'used_ram_MB',
+ 'free_ram_mb': 'free_ram_MB',
+ 'vcpus_used': 'used_vcpus',
+ }
+
+ def config_callback(self, config):
+ super(HypervisorStatsPlugin, self).config_callback(config)
+ for node in config.children:
+ if node.key == 'CpuAllocationRatio':
+ self.extra_config['cpu_ratio'] = float(node.values[0])
+ if 'cpu_ratio' not in self.extra_config:
+ self.logger.warning('CpuAllocationRatio parameter not set')
+
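+ # Worked example with assumed numbers: if CpuAllocationRatio is 8.0
+ # and a hypervisor reports vcpus=16 and vcpus_used=24, the plugin
+ # dispatches free_vcpus = int(8.0 * 16) - 24 = 104 for that host.
+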
+ def dispatch_value(self, name, value, host=None):
+ v = collectd.Values(
+ plugin=PLUGIN_NAME,
+ type='gauge',
+ type_instance=name,
+ interval=INTERVAL,
+ # w/a for https://github.com/collectd/collectd/issues/716
+ meta={'0': True},
+ values=[value]
+ )
+ if host:
+ v.host = host
+ v.dispatch()
+
+ def collect(self):
+ r = self.get('nova', 'os-hypervisors/detail')
+ if not r:
+ self.logger.warning("Could not get hypervisor statistics")
+ return
+
+ total_stats = {v: 0 for v in self.VALUE_MAP.values()}
+ total_stats['free_vcpus'] = 0
+ hypervisor_stats = r.json().get('hypervisors', [])
+ for stats in hypervisor_stats:
+ # remove domain name and keep only the hostname portion
+ host = stats['hypervisor_hostname'].split('.')[0]
+ for k, v in self.VALUE_MAP.iteritems():
+ self.dispatch_value(v, stats.get(k, 0), host)
+ total_stats[v] += stats.get(k, 0)
+ if 'cpu_ratio' in self.extra_config:
+ free = (int(self.extra_config['cpu_ratio'] *
+ stats.get('vcpus', 0))) - stats.get('vcpus_used', 0)
+ self.dispatch_value('free_vcpus', free, host)
+ total_stats['free_vcpus'] += free
+
+ # Dispatch the global metrics
+ for k, v in total_stats.iteritems():
+ self.dispatch_value('total_{}'.format(k), v)
+
+plugin = HypervisorStatsPlugin(collectd, PLUGIN_NAME)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/influxdb.py b/collectd/files/plugin/influxdb.py
new file mode 100644
index 0000000..43cd82d
--- /dev/null
+++ b/collectd/files/plugin/influxdb.py
@@ -0,0 +1,145 @@
+#!/usr/bin/python
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+import requests
+
+import collectd_base as base
+
+NAME = 'influxdb'
+METRICS_BY_NAME = {
+ 'cluster': {
+ 'writeShardPointsReq': ('cluster_write_shard_points_requests',
+ 'gauge'),
+ 'writeShardReq': ('cluster_write_shard_requests', 'gauge')},
+
+ 'httpd': {
+ 'authFail': ('httpd_failed_auths', 'gauge'),
+ 'pingReq': ('httpd_ping_requests', 'gauge'),
+ 'pointsWrittenOK': ('httpd_write_points_ok', 'gauge'),
+ 'queryReq': ('httpd_query_requests', 'gauge'),
+ 'queryRespBytes': ('httpd_query_response_bytes', 'gauge'),
+ 'req': ('httpd_requests', 'gauge'),
+ 'writeReq': ('httpd_write_requests', 'gauge'),
+ 'writeReqBytes': ('httpd_write_request_bytes', 'gauge')},
+
+ 'write': {
+ 'pointReq': ('write_point_requests', 'gauge'),
+ 'pointReqLocal': ('write_point_local_requests', 'gauge'),
+ 'pointReqRemote': ('write_point_remote_requests', 'gauge'),
+ 'req': ('write_requests', 'gauge'),
+ 'subWriteOk': ('write_sub_ok', 'gauge'),
+ 'writeOk': ('write_ok', 'gauge')},
+
+ 'runtime': {
+ 'Alloc': ('memory_alloc', 'gauge'),
+ 'TotalAlloc': ('memory_total_alloc', 'gauge'),
+ 'Sys': ('memory_system', 'gauge'),
+ 'Lookups': ('memory_lookups', 'gauge'),
+ 'Mallocs': ('memory_mallocs', 'gauge'),
+ 'Frees': ('memory_frees', 'gauge'),
+ 'HeapIdle': ('heap_idle', 'gauge'),
+ 'HeapInUse': ('heap_in_use', 'gauge'),
+ 'HeapObjects': ('heap_objects', 'gauge'),
+ 'HeapReleased': ('heap_released', 'gauge'),
+ 'HeapSys': ('heap_system', 'gauge'),
+ 'NumGC': ('garbage_collections', 'gauge'),
+ 'NumGoroutine': ('go_routines', 'gauge')}
+}
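+
+# The 'show stats' query returns JSON shaped roughly like this
+# (abridged, illustrative values):
+#   {"results": [{"series": [
+#       {"name": "httpd",
+#        "columns": ["req", "pingReq"],
+#        "values": [[1024, 10]]}]}]}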
+
+
+class InfluxDBClusterPlugin(base.Base):
+ def __init__(self, *args, **kwargs):
+ super(InfluxDBClusterPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.session = requests.Session()
+ self.address = "localhost"
+ self.port = "8086"
+ self.session.mount(
+ 'http://',
+ requests.adapters.HTTPAdapter(max_retries=3)
+ )
+
+ def config_callback(self, conf):
+ super(InfluxDBClusterPlugin, self).config_callback(conf)
+
+ username = None
+ password = None
+ for node in conf.children:
+ if node.key == 'Username':
+ username = node.values[0]
+ elif node.key == 'Password':
+ password = node.values[0]
+ elif node.key == 'Address':
+ self.address = node.values[0]
+ elif node.key == 'Port':
+ self.port = node.values[0]
+
+ if username is None or password is None:
+ self.logger.error("Username and Password parameters are required")
+ else:
+ self.session.auth = (username, password)
+
+ def itermetrics(self):
+
+ payload = {'q': 'show stats'}
+ url = "http://{}:{}/query".format(self.address, self.port)
+
+ try:
+ r = self.session.get(url, params=payload)
+ except Exception as e:
+ msg = "Got {0} when getting stats from {1}".format(e, url)
+ raise base.CheckException(msg)
+
+ if r.status_code != 200:
+ msg = "Got response {0} from {0}".format(r.status_code, url)
+ raise base.CheckException(msg)
+
+ data = r.json()
+ try:
+ series_list = data['results'][0]['series']
+ except (KeyError, IndexError):
+ self.logger.error("Failed to retrieve series for InfluxDB cluster")
+ return
+
+ for serie in series_list:
+ metrics_list = METRICS_BY_NAME.get(serie['name'])
+ if not metrics_list:
+ continue
+ for i, metric_name in enumerate(serie['columns']):
+ if metric_name in metrics_list:
+ yield {
+ 'type_instance': metrics_list[metric_name][0],
+ 'type': metrics_list[metric_name][1],
+ 'values': [serie['values'][0][i]],
+ }
+
+
+plugin = InfluxDBClusterPlugin(collectd)
+
+
+def init_callback():
+ plugin.restore_sigchld()
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_init(init_callback)
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
diff --git a/collectd/files/plugin/openstack_cinder.py b/collectd/files/plugin/openstack_cinder.py
new file mode 100644
index 0000000..3885c45
--- /dev/null
+++ b/collectd/files/plugin/openstack_cinder.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Collectd plugin for getting statistics from Cinder
+import collectd
+from collections import Counter
+from collections import defaultdict
+import re
+
+import collectd_openstack as openstack
+
+PLUGIN_NAME = 'cinder'
+INTERVAL = openstack.INTERVAL
+
+
+class CinderStatsPlugin(openstack.CollectdPlugin):
+ """ Class to report the statistics on Cinder service.
+
+ state of services
+ number of volumes broken down by state
+ total size of volumes usable and in error state
+ """
+
+ states = {'up': 0, 'down': 1, 'disabled': 2}
+ cinder_re = re.compile('^cinder-')
+
+ def collect(self):
+
+ # Get information of the state per service
+ # State can be: 'up', 'down' or 'disabled'
+ aggregated_workers = defaultdict(Counter)
+
+ for worker in self.iter_workers('cinder'):
+ host = worker['host'].split('.')[0]
+ service = self.cinder_re.sub('', worker['service'])
+ state = worker['state']
+
+ aggregated_workers[service][state] += 1
+ self.dispatch_value('cinder_service', '',
+ self.states[state],
+ {'host': host,
+ 'service': service,
+ 'state': state})
+
+ for service in aggregated_workers:
+ for state in self.states:
+ self.dispatch_value('cinder_services', '',
+ aggregated_workers[service][state],
+ {'state': state, 'service': service})
+
+ volumes_details = self.get_objects_details('cinder', 'volumes')
+
+ def groupby(d):
+ return d.get('status', 'unknown').lower()
+
+ def count_size_bytes(d):
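+ # volume sizes are reported in GB; convert to bytes
+ # (e.g. a 20 GB volume counts as 20 * 10 ** 9 bytes)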
+ return d.get('size', 0) * 10 ** 9
+
+ status = self.count_objects_group_by(volumes_details,
+ group_by_func=groupby)
+ for s, nb in status.iteritems():
+ self.dispatch_value('volumes', s, nb)
+
+ sizes = self.count_objects_group_by(volumes_details,
+ group_by_func=groupby,
+ count_func=count_size_bytes)
+ for n, size in sizes.iteritems():
+ self.dispatch_value('volumes_size', n, size)
+
+ snaps_details = self.get_objects_details('cinder', 'snapshots')
+ status_snaps = self.count_objects_group_by(snaps_details,
+ group_by_func=groupby)
+ for s, nb in status_snaps.iteritems():
+ self.dispatch_value('snapshots', s, nb)
+
+ sizes = self.count_objects_group_by(snaps_details,
+ group_by_func=groupby,
+ count_func=count_size_bytes)
+ for n, size in sizes.iteritems():
+ self.dispatch_value('snapshots_size', n, size)
+
+ def dispatch_value(self, plugin_instance, name, value, meta=None):
+ v = collectd.Values(
+ plugin=PLUGIN_NAME, # metric source
+ plugin_instance=plugin_instance,
+ type='gauge',
+ type_instance=name,
+ interval=INTERVAL,
+ # w/a for https://github.com/collectd/collectd/issues/716
+ meta=meta or {'0': True},
+ values=[value]
+ )
+ v.dispatch()
+
+plugin = CinderStatsPlugin(collectd, PLUGIN_NAME)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/openstack_glance.py b/collectd/files/plugin/openstack_glance.py
new file mode 100644
index 0000000..1077083
--- /dev/null
+++ b/collectd/files/plugin/openstack_glance.py
@@ -0,0 +1,101 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Collectd plugin for getting resource statistics from Glance
+import collectd
+
+import collectd_openstack as openstack
+
+PLUGIN_NAME = 'glance'
+INTERVAL = openstack.INTERVAL
+
+
+class GlanceStatsPlugin(openstack.CollectdPlugin):
+ """ Class to report the statistics on Glance service.
+
+ number of images broken down by state
+ total size of images usable and in error state
+ """
+
+ def collect(self):
+
+ def is_snap(d):
+ return d.get('properties', {}).get('image_type') == 'snapshot'
+
+ def groupby(d):
+ p = 'public' if d.get('is_public', True) else 'private'
+ status = d.get('status', 'unknown').lower()
+ if is_snap(d):
+ return 'snapshots.%s.%s' % (p, status)
+ return 'images.%s.%s' % (p, status)
+
+ images_details = self.get_objects_details('glance', 'images',
+ api_version='v1',
+ params='is_public=None')
+ status = self.count_objects_group_by(images_details,
+ group_by_func=groupby)
+ for s, nb in status.iteritems():
+ (name, visibility, status) = s.split('.')
+ self.dispatch_value(name, nb, meta={'visibility': visibility,
+ 'status': status})
+
+ # sizes
+ def count_size_bytes(d):
+ return d.get('size', 0)
+
+ def groupby_size(d):
+ p = 'public' if d.get('is_public', True) else 'private'
+ status = d.get('status', 'unknown').lower()
+ if is_snap(d):
+ return 'snapshots_size.%s.%s' % (p, status)
+ return 'images_size.%s.%s' % (p, status)
+
+ sizes = self.count_objects_group_by(images_details,
+ group_by_func=groupby_size,
+ count_func=count_size_bytes)
+ for s, nb in sizes.iteritems():
+ (name, visibility, status) = s.split('.')
+ self.dispatch_value(name, nb, meta={'visibility': visibility,
+ 'status': status})
+
+ def dispatch_value(self, name, value, meta=None):
+ v = collectd.Values(
+ plugin=PLUGIN_NAME, # metric source
+ type='gauge',
+ type_instance=name,
+ interval=INTERVAL,
+ # w/a for https://github.com/collectd/collectd/issues/716
+ meta=meta or {'0': True},
+ values=[value]
+ )
+ v.dispatch()
+
+plugin = GlanceStatsPlugin(collectd, PLUGIN_NAME)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/openstack_keystone.py b/collectd/files/plugin/openstack_keystone.py
new file mode 100644
index 0000000..c88c6e2
--- /dev/null
+++ b/collectd/files/plugin/openstack_keystone.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Collectd plugin for getting statistics from Keystone
+import collectd
+
+import collectd_openstack as openstack
+
+PLUGIN_NAME = 'keystone'
+INTERVAL = openstack.INTERVAL
+
+
+class KeystoneStatsPlugin(openstack.CollectdPlugin):
+ """ Class to report the statistics on Keystone service.
+
+ number of tenants, users broken down by state
+ number of roles
+ """
+
+ def collect(self):
+
+ def groupby(d):
+ return 'enabled' if d.get('enabled') else 'disabled'
+
+ # tenants
+ r = self.get('keystone', 'tenants')
+ if not r:
+ self.logger.warning('Could not find Keystone tenants')
+ return
+ tenants_details = r.json().get('tenants', [])
+ status = self.count_objects_group_by(tenants_details,
+ group_by_func=groupby)
+ for s, nb in status.iteritems():
+ self.dispatch_value('tenants', nb, meta={'state': s})
+
+ # users
+ r = self.get('keystone', 'users')
+ if not r:
+ self.logger.warning('Could not find Keystone users')
+ return
+ users_details = r.json().get('users', [])
+ status = self.count_objects_group_by(users_details,
+ group_by_func=groupby)
+ for s, nb in status.iteritems():
+ self.dispatch_value('users', nb, meta={'state': s})
+
+ # roles
+ r = self.get('keystone', 'OS-KSADM/roles')
+ if not r:
+ self.logger.warning('Could not find Keystone roles')
+ return
+ roles = r.json().get('roles', [])
+ self.dispatch_value('roles', len(roles))
+
+ def dispatch_value(self, name, value, meta=None):
+ v = collectd.Values(
+ plugin=PLUGIN_NAME, # metric source
+ type='gauge',
+ type_instance=name,
+ interval=INTERVAL,
+ # w/a for https://github.com/collectd/collectd/issues/716
+ meta=meta or {'0': True},
+ values=[value]
+ )
+ v.dispatch()
+
+plugin = KeystoneStatsPlugin(collectd, PLUGIN_NAME)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/openstack_neutron.py b/collectd/files/plugin/openstack_neutron.py
new file mode 100644
index 0000000..6a2cf22
--- /dev/null
+++ b/collectd/files/plugin/openstack_neutron.py
@@ -0,0 +1,158 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Collectd plugin for getting resource statistics from Neutron
+import collectd
+from collections import Counter
+from collections import defaultdict
+import re
+
+import collectd_openstack as openstack
+
+PLUGIN_NAME = 'neutron'
+INTERVAL = openstack.INTERVAL
+
+
+class NeutronStatsPlugin(openstack.CollectdPlugin):
+ """ Class to report the statistics on Neutron service.
+
+ state of agents
+ number of networks broken down by status
+ number of subnets
+ number of ports broken down by owner and status
+ number of routers broken down by status
+ number of floating IP addresses broken down by free/associated
+ """
+
+ neutron_re = re.compile('^neutron-')
+ agent_re = re.compile('-agent$')
+ states = {'up': 0, 'down': 1, 'disabled': 2}
+
+ def collect(self):
+
+ def groupby_network(x):
+ return "networks.%s" % x.get('status', 'unknown').lower()
+
+ def groupby_router(x):
+ return "routers.%s" % x.get('status', 'unknown').lower()
+
+ def groupby_port(x):
+ owner = x.get('device_owner', 'none')
+ if owner.startswith('network:'):
+ owner = owner.replace('network:', '')
+ elif owner.startswith('compute:'):
+ # The part after 'compute:' is the name of the Nova AZ
+ owner = 'compute'
+ else:
+ owner = 'none'
+ status = x.get('status', 'unknown').lower()
+ return "ports.%s.%s" % (owner, status)
+
+ def groupby_floating(x):
+ if x.get('port_id', None):
+ status = 'associated'
+ else:
+ status = 'free'
+ return "floatingips.%s" % status
+
+ # Get information of the state per agent
+ # State can be up or down
+ aggregated_agents = defaultdict(Counter)
+
+ for agent in self.iter_workers('neutron'):
+ host = agent['host'].split('.')[0]
+ service = self.agent_re.sub(
+ '', self.neutron_re.sub('', agent['service']))
+ state = agent['state']
+
+ aggregated_agents[service][state] += 1
+ self.dispatch_value('neutron_agent',
+ self.states[state],
+ {'host': host,
+ 'service': service,
+ 'state': state})
+
+ for service in aggregated_agents:
+ for state in self.states:
+ self.dispatch_value('neutron_agents',
+ aggregated_agents[service][state],
+ {'service': service, 'state': state})
+
+ # Networks
+ networks = self.get_objects('neutron', 'networks', api_version='v2.0')
+ status = self.count_objects_group_by(networks,
+ group_by_func=groupby_network)
+ for s, nb in status.iteritems():
+ self.dispatch_value(s, nb)
+ self.dispatch_value('networks', len(networks))
+
+ # Subnets
+ subnets = self.get_objects('neutron', 'subnets', api_version='v2.0')
+ self.dispatch_value('subnets', len(subnets))
+
+ # Ports
+ ports = self.get_objects('neutron', 'ports', api_version='v2.0')
+ status = self.count_objects_group_by(ports,
+ group_by_func=groupby_port)
+ for s, nb in status.iteritems():
+ self.dispatch_value(s, nb)
+ self.dispatch_value('ports', len(ports))
+
+ # Routers
+ routers = self.get_objects('neutron', 'routers', api_version='v2.0')
+ status = self.count_objects_group_by(routers,
+ group_by_func=groupby_router)
+ for s, nb in status.iteritems():
+ self.dispatch_value(s, nb)
+ self.dispatch_value('routers', len(routers))
+
+ # Floating IP addresses
+ floatingips = self.get_objects('neutron', 'floatingips',
+ api_version='v2.0')
+ status = self.count_objects_group_by(floatingips,
+ group_by_func=groupby_floating)
+ for s, nb in status.iteritems():
+ self.dispatch_value(s, nb)
+ self.dispatch_value('floatingips', len(floatingips))
+
+ def dispatch_value(self, name, value, meta=None):
+ v = collectd.Values(
+ plugin=PLUGIN_NAME, # metric source
+ type='gauge',
+ type_instance=name,
+ interval=INTERVAL,
+ # w/a for https://github.com/collectd/collectd/issues/716
+ meta=meta or {'0': True},
+ values=[value]
+ )
+ v.dispatch()
+
+plugin = NeutronStatsPlugin(collectd, PLUGIN_NAME)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/openstack_nova.py b/collectd/files/plugin/openstack_nova.py
new file mode 100644
index 0000000..62c55da
--- /dev/null
+++ b/collectd/files/plugin/openstack_nova.py
@@ -0,0 +1,99 @@
+#!/usr/bin/python
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Collectd plugin for getting statistics from Nova
+import collectd
+from collections import Counter
+from collections import defaultdict
+import re
+
+import collectd_openstack as openstack
+
+PLUGIN_NAME = 'nova'
+INTERVAL = openstack.INTERVAL
+
+
+class NovaStatsPlugin(openstack.CollectdPlugin):
+ """ Class to report the statistics on Nova service.
+
+ status per service and number of instances broken down by state
+ """
+
+ states = {'up': 0, 'down': 1, 'disabled': 2}
+ nova_re = re.compile('^nova-')
+
+ def collect(self):
+
+ # Get information of the state per service
+ # State can be: 'up', 'down' or 'disabled'
+ aggregated_workers = defaultdict(Counter)
+
+ for worker in self.iter_workers('nova'):
+ host = worker['host'].split('.')[0]
+ service = self.nova_re.sub('', worker['service'])
+ state = worker['state']
+
+ aggregated_workers[service][state] += 1
+ self.dispatch_value('nova_service', '',
+ self.states[state],
+ {'host': host,
+ 'service': service,
+ 'state': state})
+
+ for service in aggregated_workers:
+ for state in self.states:
+ self.dispatch_value('nova_services', '',
+ aggregated_workers[service][state],
+ {'state': state, 'service': service})
+
+ servers_details = self.get_objects_details('nova', 'servers')
+
+ def groupby(d):
+ return d.get('status', 'unknown').lower()
+ status = self.count_objects_group_by(servers_details,
+ group_by_func=groupby)
+ for s, nb in status.iteritems():
+ self.dispatch_value('instances', s, nb)
+
+ def dispatch_value(self, plugin_instance, name, value, meta=None):
+ v = collectd.Values(
+ plugin=PLUGIN_NAME, # metric source
+ plugin_instance=plugin_instance,
+ type='gauge',
+ type_instance=name,
+ interval=INTERVAL,
+ # w/a for https://github.com/collectd/collectd/issues/716
+ meta=meta or {'0': True},
+ values=[value]
+ )
+ v.dispatch()
+
+plugin = NovaStatsPlugin(collectd, PLUGIN_NAME)
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def notification_callback(notification):
+ plugin.notification_callback(notification)
+
+
+def read_callback():
+ plugin.conditional_read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_notification(notification_callback)
+collectd.register_read(read_callback, INTERVAL)
diff --git a/collectd/files/plugin/rabbitmq_info.py b/collectd/files/plugin/rabbitmq_info.py
new file mode 100644
index 0000000..c92ce0e
--- /dev/null
+++ b/collectd/files/plugin/rabbitmq_info.py
@@ -0,0 +1,133 @@
+# Name: rabbitmq-collectd-plugin - rabbitmq_info.py
+# Description: This plugin uses Collectd's Python plugin to obtain RabbitMQ
+# metrics.
+#
+# Copyright 2015 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collectd
+
+import collectd_base as base
+import requests
+
+NAME = 'rabbitmq_info'
+# Override in config by specifying 'Host'.
+HOST = '127.0.0.1'
+# Override in config by specifying 'Port'.
+PORT = '15672'
+
+
+class RabbitMqPlugin(base.Base):
+
+ def __init__(self, *args, **kwargs):
+ super(RabbitMqPlugin, self).__init__(*args, **kwargs)
+ self.plugin = NAME
+ self.username = None
+ self.password = None
+ self.host = HOST
+ self.port = PORT
+ self.session = None
+
+ def config_callback(self, conf):
+ super(RabbitMqPlugin, self).config_callback(conf)
+
+ for node in conf.children:
+ if node.key == 'Username':
+ self.username = node.values[0]
+ if node.key == 'Password':
+ self.password = node.values[0]
+ if node.key == 'Host':
+ self.host = node.values[0]
+ if node.key == 'Port':
+ self.port = node.values[0]
+
+ if not (self.username and self.password):
+ self.logger.error('Username and Password parameters are required')
+
+ self.session = requests.Session()
+ self.session.auth = (self.username, self.password)
+ self.session.mount(
+ 'http://',
+ requests.adapters.HTTPAdapter(max_retries=self.max_retries)
+ )
+ url = "http://{}:{}".format(self.host, self.port)
+ self.api_nodes_url = "{}/api/nodes".format(url)
+ self.api_overview_url = "{}/api/overview".format(url)
+
+ def itermetrics(self):
+ stats = {}
+ try:
+ r = self.session.get(self.api_overview_url, timeout=self.timeout)
+ overview = r.json()
+ except Exception as e:
+ msg = "Got exception for '{}': {}".format(self.api_overview_url, e)
+ raise base.CheckException(msg)
+
+ if r.status_code != 200:
+ msg = "{} responded with code {}".format(
+ self.api_overview_url, r.status_code)
+ raise base.CheckException(msg)
+
+ objects = overview['object_totals']
+ stats['queues'] = objects['queues']
+ stats['consumers'] = objects['consumers']
+ stats['connections'] = objects['connections']
+ stats['exchanges'] = objects['exchanges']
+ stats['channels'] = objects['channels']
+ stats['messages'] = overview['queue_totals']['messages']
+ stats['running_nodes'] = len(overview['contexts'])
+
+ for k, v in stats.iteritems():
+ yield {'type_instance': k, 'values': v}
+
+ stats = {}
+ nodename = overview['node']
+ try:
+ r = self.session.get("{}/{}".format(self.api_nodes_url, nodename),
+ timeout=self.timeout)
+ node = r.json()
+ except Exception as e:
+ msg = "Got exception for '{}': {}".format(self.api_nodes_url, e)
+ raise base.CheckException(msg)
+
+ if r.status_code != 200:
+ msg = "{} responded with code {}".format(
+ self.api_nodes_url, r.status_code)
+ self.logger.error(msg)
+ raise base.CheckException(msg)
+
+ stats['disk_free_limit'] = node['disk_free_limit']
+ stats['disk_free'] = node['disk_free']
+ stats['remaining_disk'] = node['disk_free'] - node['disk_free_limit']
+
+ stats['used_memory'] = node['mem_used']
+ stats['vm_memory_limit'] = node['mem_limit']
+ stats['remaining_memory'] = node['mem_limit'] - node['mem_used']
+
+ for k, v in stats.iteritems():
+ yield {'type_instance': k, 'values': v}
+
+
+plugin = RabbitMqPlugin(collectd, 'rabbitmq')
+
+
+def config_callback(conf):
+ plugin.config_callback(conf)
+
+
+def read_callback():
+ plugin.read_callback()
+
+collectd.register_config(config_callback)
+collectd.register_read(read_callback)
diff --git a/collectd/files/plugin/types/ceph.db b/collectd/files/plugin/types/ceph.db
new file mode 100644
index 0000000..24ee88a
--- /dev/null
+++ b/collectd/files/plugin/types/ceph.db
@@ -0,0 +1,46 @@
+# For all types, plugin_instance contains the Ceph cluster name
+
+# cluster health metrics
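+# health is constrained to the 1..3 range; the usual encoding is
+# 1=HEALTH_OK, 2=HEALTH_WARN, 3=HEALTH_ERR (this mapping is an assumption
+# here; the actual values are set by the collecting plugin)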
+health value:GAUGE:1:3
+monitor_count value:GAUGE:0:U
+quorum_count value:GAUGE:0:U
+# number of placement groups per state, type_instance is the PG's state
+pg_state_count value:GAUGE:0:U
+# total number of objects
+objects_count value:GAUGE:0:U
+# total number of placement groups
+pg_count value:GAUGE:0:U
+# used, free and total amount of bytes
+pg_bytes used:GAUGE:0:U free:GAUGE:0:U total:GAUGE:0:U
+# amount of data bytes
+pg_data_bytes value:GAUGE:0:U
+
+# number of bytes used in the pool, type_instance is the pool name
+pool_bytes_used value:GAUGE:0:U
+# max number of bytes available in the pool, type_instance is the pool name
+pool_max_avail value:GAUGE:0:U
+# number of objects in the pool, type_instance is the pool name
+pool_objects value:GAUGE:0:U
+# number of pools
+pool_count value:GAUGE:0:U
+# used, free and total amount of bytes for all pools
+pool_total_bytes used:GAUGE:0:U free:GAUGE:0:U total:GAUGE:0:U
+# percentage of used, free and total space for all pools
+pool_total_percent used:GAUGE:0:101 free:GAUGE:0:101
+# number of bytes received and transmitted per second, type_instance is the pool name
+pool_bytes_rate rx:GAUGE:0:U tx:GAUGE:0:U
+# number of operations per second, type_instance is the pool name
+pool_ops_rate value:GAUGE:0:U
+# number of objects, type_instance is the pool name
+pool_pg_num value:GAUGE:0:U
+# number of placement groups, type_instance is the pool name
+pool_pg_placement_num value:GAUGE:0:U
+# size of the pool, type_instance is the pool name
+pool_size value:GAUGE:0:U
+
+# number of OSDs per state
+osd_count up:GAUGE:0:U down:GAUGE:0:U in:GAUGE:0:U out:GAUGE:0:U
+# amount of used and total space in bytes, type_instance is the OSD identifier
+osd_space used:GAUGE:0:U total:GAUGE:0:U
+# apply and commit latencies in milliseconds, type_instance is the OSD identifier
+osd_latency apply:GAUGE:0:U commit:GAUGE:0:U
diff --git a/collectd/files/plugin/types/ceph_perf.db b/collectd/files/plugin/types/ceph_perf.db
new file mode 100644
index 0000000..c557bb0
--- /dev/null
+++ b/collectd/files/plugin/types/ceph_perf.db
@@ -0,0 +1,74 @@
+# File generated automatically by the build_ceph_perf_types.py script
+# Ceph versions: 0.80.9, 0.94.5
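+# Each entry is a single unbounded GAUGE, one per OSD perf counter
+# (names presumably mirror the counters exposed by the Ceph admin
+# socket's "perf dump" output, prefixed with the daemon type)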
+osd_agent_evict value:GAUGE:U:U
+osd_agent_flush value:GAUGE:U:U
+osd_agent_skip value:GAUGE:U:U
+osd_agent_wake value:GAUGE:U:U
+osd_buffer_bytes value:GAUGE:U:U
+osd_copyfrom value:GAUGE:U:U
+osd_heartbeat_from_peers value:GAUGE:U:U
+osd_heartbeat_to_peers value:GAUGE:U:U
+osd_loadavg value:GAUGE:U:U
+osd_map_message_epoch_dups value:GAUGE:U:U
+osd_map_message_epochs value:GAUGE:U:U
+osd_map_messages value:GAUGE:U:U
+osd_messages_delayed_for_map value:GAUGE:U:U
+osd_numpg value:GAUGE:U:U
+osd_numpg_primary value:GAUGE:U:U
+osd_numpg_replica value:GAUGE:U:U
+osd_numpg_stray value:GAUGE:U:U
+osd_object_ctx_cache_hit value:GAUGE:U:U
+osd_object_ctx_cache_total value:GAUGE:U:U
+osd_op value:GAUGE:U:U
+osd_op_in_bytes value:GAUGE:U:U
+osd_op_latency value:GAUGE:U:U
+osd_op_out_bytes value:GAUGE:U:U
+osd_op_process_latency value:GAUGE:U:U
+osd_op_r value:GAUGE:U:U
+osd_op_r_latency value:GAUGE:U:U
+osd_op_r_out_bytes value:GAUGE:U:U
+osd_op_r_process_latency value:GAUGE:U:U
+osd_op_rw value:GAUGE:U:U
+osd_op_rw_in_bytes value:GAUGE:U:U
+osd_op_rw_latency value:GAUGE:U:U
+osd_op_rw_out_bytes value:GAUGE:U:U
+osd_op_rw_process_latency value:GAUGE:U:U
+osd_op_rw_rlat value:GAUGE:U:U
+osd_op_w value:GAUGE:U:U
+osd_op_w_in_bytes value:GAUGE:U:U
+osd_op_w_latency value:GAUGE:U:U
+osd_op_w_process_latency value:GAUGE:U:U
+osd_op_w_rlat value:GAUGE:U:U
+osd_op_wip value:GAUGE:U:U
+osd_opq value:GAUGE:U:U
+osd_pull value:GAUGE:U:U
+osd_push value:GAUGE:U:U
+osd_push_in value:GAUGE:U:U
+osd_push_in_bytes value:GAUGE:U:U
+osd_push_out_bytes value:GAUGE:U:U
+osd_recovery_ops value:GAUGE:U:U
+osd_stat_bytes value:GAUGE:U:U
+osd_stat_bytes_avail value:GAUGE:U:U
+osd_stat_bytes_used value:GAUGE:U:U
+osd_subop value:GAUGE:U:U
+osd_subop_in_bytes value:GAUGE:U:U
+osd_subop_latency value:GAUGE:U:U
+osd_subop_pull value:GAUGE:U:U
+osd_subop_pull_latency value:GAUGE:U:U
+osd_subop_push value:GAUGE:U:U
+osd_subop_push_in_bytes value:GAUGE:U:U
+osd_subop_push_latency value:GAUGE:U:U
+osd_subop_w value:GAUGE:U:U
+osd_subop_w_in_bytes value:GAUGE:U:U
+osd_subop_w_latency value:GAUGE:U:U
+osd_tier_clean value:GAUGE:U:U
+osd_tier_delay value:GAUGE:U:U
+osd_tier_dirty value:GAUGE:U:U
+osd_tier_evict value:GAUGE:U:U
+osd_tier_flush value:GAUGE:U:U
+osd_tier_flush_fail value:GAUGE:U:U
+osd_tier_promote value:GAUGE:U:U
+osd_tier_proxy_read value:GAUGE:U:U
+osd_tier_try_flush value:GAUGE:U:U
+osd_tier_try_flush_fail value:GAUGE:U:U
+osd_tier_whiteout value:GAUGE:U:U
diff --git a/collectd/init.sls b/collectd/init.sls
index 6ecdd09..b794478 100644
--- a/collectd/init.sls
+++ b/collectd/init.sls
@@ -1,6 +1,4 @@
{%- if pillar.collectd is defined %}
include:
-{%- if pillar.collectd.client is defined %}
- collectd.client
-{%- endif %}
{%- endif %}
\ No newline at end of file
diff --git a/collectd/map.jinja b/collectd/map.jinja
index 2dadafd..dd0cdf7 100644
--- a/collectd/map.jinja
+++ b/collectd/map.jinja
@@ -5,17 +5,26 @@
'service': 'collectd',
'config_file': '/etc/collectd.conf',
'config_dir': '/etc/collectd.d',
+ 'read_interval': 60,
+ 'file_logging': True,
+ 'remote_collector': False
},
'Debian': {
'pkgs': ['collectd-core', 'snmp', 'python-yaml'],
'service': 'collectd',
'config_file': '/etc/collectd/collectd.conf',
'config_dir': '/etc/collectd/conf.d',
+ 'read_interval': 60,
+ 'file_logging': True,
+ 'remote_collector': False
},
'RedHat': {
'pkgs': ['collectd', 'collectd-ping', 'net-snmp', 'PyYAML'],
'service': 'collectd',
'config_file': '/etc/collectd.conf',
'config_dir': '/etc/collectd.d',
+ 'read_interval': 60,
+ 'file_logging': True,
+ 'remote_collector': False
},
}, merge=salt['pillar.get']('collectd:client')) %}
diff --git a/collectd/meta/collectd.yml b/collectd/meta/collectd.yml
index 20fd457..98d6ee1 100644
--- a/collectd/meta/collectd.yml
+++ b/collectd/meta/collectd.yml
@@ -1,29 +1,33 @@
plugin:
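+  # 'execution: local' (the default) renders the plugin's config on the
+  # monitored node itself; other values render only when the client's
+  # remote_collector option is enabled (see the guard in collectd/client.sls)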
collectd_processes:
plugin: processes
- interval: 60
+ execution: local
template: collectd/files/collectd_processes.conf
process:
- collectdmon:
- match: collectdmon
+ collectdmon: {}
+ collectd_check_local_endpoint:
+ plugin: python
+ execution: local
+ template: collectd/files/collectd_check_local_endpoint.conf
+ endpoint: {}
{%- if pillar.collectd.client.get('check', {}).curl is defined %}
collectd_curl:
plugin: curl
- interval: 60
+ execution: local
template: collectd/files/collectd_curl.conf
data: {{ pillar.collectd.client.check.curl|yaml }}
{%- endif %}
{%- if pillar.collectd.client.get('check', {}).ping is defined %}
collectd_ping:
plugin: ping
- interval: 60
+ execution: local
template: collectd/files/collectd_ping.conf
data: {{ pillar.collectd.client.check.ping|yaml }}
{%- endif %}
{%- if pillar.get('external', {}).network_device is defined %}
collectd_network_device:
plugin: snmp
- interval: 60
+ execution: local
template: collectd/files/collectd_snmp.conf
data:
std_traffic:
@@ -40,5 +44,5 @@
values:
- 1.3.6.1.2.1.31.1.1.1.7
- 1.3.6.1.2.1.31.1.1.1.11
- host: {{ pillar.external.network_device }}
-{%- endif %}
+ host: {{ pillar.external.network_device|yaml }}
+{%- endif %}
\ No newline at end of file