Add support for Ceph monitoring

This change adds support for Ceph monitoring:
- service.monitoring.cluster_stats is applied to monitoring nodes for
  collecting cluster-wide metrics (through the Ceph CLI).
- service.monitoring.node_stats is applied to the Ceph nodes for
  collecting monitor and OSD metrics (through Unix sockets).

Because Telegraf runs as a container on the monitoring nodes and
requires a working Ceph client configuration, this change also adds
support for deploying Ceph client in container mode.

Change-Id: If7359aca34a350f2c8ee2251bbe8a85314550a45
diff --git a/README.rst b/README.rst
index c34cdc6..a8fa04a 100644
--- a/README.rst
+++ b/README.rst
@@ -65,7 +65,7 @@
           glance:
             key: 00000000000000000000000000000000000000==
 
-Client pillar - ussually located at cinder-volume or glance-registry.
+Client pillar - usually located at cinder-volume or glance-registry.
 
 .. code-block:: yaml
 
@@ -92,6 +92,34 @@
           glance:
             key: 00000000000000000000000000000000000000==
 
+Monitoring Ceph cluster - collect cluster metrics
+
+.. code-block:: yaml
+
+    ceph:
+      client:
+        config:
+          global:
+            mon initial members: ceph1,ceph2,ceph3
+            mon host: 10.103.255.252:6789,10.103.255.253:6789,10.103.255.254:6789
+        keyring:
+          monitoring:
+            key: 00000000000000000000000000000000000000==
+      monitoring:
+        cluster_stats:
+          enabled: true
+          ceph_user: monitoring
+
+Monitoring Ceph services - collect metrics from monitor and OSD services
+
+.. code-block:: yaml
+
+    ceph:
+      monitoring:
+        node_stats:
+          enabled: true
+
+
 Read more
 =========
 
diff --git a/ceph/client.sls b/ceph/client.sls
index 3361772..bd027a6 100644
--- a/ceph/client.sls
+++ b/ceph/client.sls
@@ -1,11 +1,13 @@
 {%- from "ceph/map.jinja" import client with context %}
 {%- if client.enabled %}
 
+{% if not client.container_mode %}
 ceph_client_packages:
   pkg.installed:
   - names: {{ client.pkgs }}
+{%- endif %}
 
-/etc/ceph:
+{{ client.prefix_dir }}/etc/ceph:
   file.directory:
     - user: root
     - group: root
@@ -14,7 +16,7 @@
 
 {%- for keyring_name, keyring in client.keyring.iteritems() %}
 
-/etc/ceph/ceph.client.{{ keyring_name }}.keyring:
+{{ client.prefix_dir }}/etc/ceph/ceph.client.{{ keyring_name }}.keyring:
   file.managed:
     - user: root
     - group: root
@@ -24,13 +26,15 @@
     - contents: |
         [client.{{ keyring_name  }}]
     - require:
-      - file: /etc/ceph
+      - file: {{ client.prefix_dir }}/etc/ceph
 
   ini.options_present:
   - sections:
       client.{{ keyring_name }}: {{ keyring|yaml }}
+{% if not client.container_mode %}
   - require:
     - pkg: ceph_client_packages
+{%- endif %}
 
 {%- endfor %}
 
@@ -43,7 +47,7 @@
 {%- set _dummy = config.update(config_fragment) %}
 {%- endfor %}
 
-/etc/ceph/ceph.conf:
+{{ client.prefix_dir }}/etc/ceph/ceph.conf:
   file.managed:
     - user: root
     - group: root
@@ -53,12 +57,14 @@
     - contents: |
         [global]
     - require:
-      - file: /etc/ceph
+      - file: {{ client.prefix_dir }}/etc/ceph
 
   ini.options_present:
   - sections: {{ config|yaml }}
   - require:
+{% if not client.container_mode %}
     - pkg: ceph_client_packages
-    - file: /etc/ceph
+{%- endif %}
+    - file: {{ client.prefix_dir }}/etc/ceph
 
 {%- endif %}
diff --git a/ceph/files/telegraf.conf b/ceph/files/telegraf.conf
new file mode 100644
index 0000000..2ce6c27
--- /dev/null
+++ b/ceph/files/telegraf.conf
@@ -0,0 +1,33 @@
+[[inputs.ceph]]
+{%- if values.interval is defined %}
+  interval = "{{ values.interval }}"
+{%- endif %}
+  gather_admin_socket_stats = {{ values.gather_admin_socket_stats|lower }}
+  gather_cluster_stats = {{ values.gather_cluster_stats|lower }}
+  gather_pool_loads = {{ values.gather_pool_loads|lower }}
+{%- if values.ceph_binary is defined %}
+  ceph_binary = "{{ values.ceph_binary }}"
+{%- endif %}
+{%- if values.rados_binary is defined %}
+  rados_binary = "{{ values.rados_binary }}"
+{%- endif %}
+{%- if values.ceph_config is defined %}
+  ceph_config = "{{ values.ceph_config }}"
+{%- endif %}
+{%- if values.ceph_user is defined %}
+  ceph_user = "{{ values.ceph_user }}"
+{%- endif %}
+{%- if values.socket_dir is defined %}
+  socket_dir = "{{ values.socket_dir }}"
+{%- endif %}
+{%- if values.mon_prefix is defined %}
+  mon_prefix = "{{ values.mon_prefix }}"
+{%- endif %}
+{%- if values.osd_prefix is defined %}
+  osd_prefix = "{{ values.osd_prefix }}"
+{%- endif %}
+{%- if values.socket_suffix is defined %}
+  socket_suffix = "{{ values.socket_suffix }}"
+{%- endif %}
+
+{%- include 'telegraf/files/input/_tags.conf' %}
diff --git a/ceph/init.sls b/ceph/init.sls
index d44ffec..e6e7103 100644
--- a/ceph/init.sls
+++ b/ceph/init.sls
@@ -10,4 +10,7 @@
 {% endif %}
 {% if pillar.ceph.radosgw is defined %}
 - ceph.radosgw
-{% endif %}
\ No newline at end of file
+{% endif %}
+{% if pillar.ceph.monitoring is defined %}
+- ceph.monitoring
+{% endif %}
diff --git a/ceph/map.jinja b/ceph/map.jinja
index 8eabda2..061bfbc 100644
--- a/ceph/map.jinja
+++ b/ceph/map.jinja
@@ -23,12 +23,23 @@
 {% set client = salt['grains.filter_by']({
     'Debian': {
         'pkgs': ['ceph-common'],
+        'container_mode': False,
+        'prefix_dir': '',
     },
     'RedHat': {
         'pkgs': ['ceph-common'],
+        'container_mode': False,
+        'prefix_dir': '',
     },
 }, merge=salt['pillar.get']('ceph:client')) %}
 
+{% set monitoring = salt['grains.filter_by']({
+    'default': {
+        'cluster_stats': {},
+        'node_stats': {},
+    },
+}, merge=salt['pillar.get']('ceph:monitoring')) %}
+
 {% set radosgw = salt['grains.filter_by']({
     'Debian': {
         'pkgs': ['ceph-common','radosgw','python-rados','librados2'],
diff --git a/ceph/meta/telegraf.yml b/ceph/meta/telegraf.yml
new file mode 100644
index 0000000..b608585
--- /dev/null
+++ b/ceph/meta/telegraf.yml
@@ -0,0 +1,43 @@
+{%- from "ceph/map.jinja" import monitoring with context %}
+
+{%- if monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined %}
+remote_agent:
+  input:
+    ceph:
+      template: ceph/files/telegraf.conf
+      ceph_user: client.{{ monitoring.cluster_stats.ceph_user }}
+{%- if monitoring.cluster_stats.ceph_binary is defined %}
+      ceph_binary: {{ monitoring.cluster_stats.ceph_binary }}
+{%- endif %}
+{%- if monitoring.cluster_stats.rados_binary is defined %}
+      rados_binary: {{ monitoring.cluster_stats.rados_binary }}
+{%- endif %}
+      gather_admin_socket_stats: false
+      gather_cluster_stats: true
+      gather_pool_loads: {{ monitoring.cluster_stats.gather_pool_loads|default('true') }}
+{%- if monitoring.interval is defined %}
+      interval: {{ monitoring.interval }}
+{%- endif %}
+{%- endif %}
+
+{%- if monitoring.node_stats.get('enabled') %}
+agent:
+  input:
+    ceph:
+      template: ceph/files/telegraf.conf
+{%- if monitoring.node_stats.socket_dir is defined %}
+      socket_dir: {{ monitoring.node_stats.socket_dir }}
+{%- endif %}
+{%- if monitoring.node_stats.mon_prefix is defined %}
+      mon_prefix: {{ monitoring.node_stats.mon_prefix }}
+{%- endif %}
+{%- if monitoring.node_stats.osd_prefix is defined %}
+      osd_prefix: {{ monitoring.node_stats.osd_prefix }}
+{%- endif %}
+      gather_admin_socket_stats: true
+      gather_cluster_stats: false
+      gather_pool_loads: false
+{%- if monitoring.interval is defined %}
+      interval: {{ monitoring.interval }}
+{%- endif %}
+{%- endif %}
diff --git a/ceph/monitoring.sls b/ceph/monitoring.sls
new file mode 100644
index 0000000..6328255
--- /dev/null
+++ b/ceph/monitoring.sls
@@ -0,0 +1,8 @@
+{#
+
+The ceph.monitoring state is only required for the Telegraf plugins, the
+Prometheus alerts and the Grafana dashboards.
+
+So everything happens in ceph/meta/(grafana|telegraf|prometheus).yml.
+
+#}
diff --git a/metadata/service/client/container.yml b/metadata/service/client/container.yml
new file mode 100644
index 0000000..dfcd5a6
--- /dev/null
+++ b/metadata/service/client/container.yml
@@ -0,0 +1,8 @@
+applications:
+- ceph
+parameters:
+  ceph:
+    client:
+      enabled: true
+      container_mode: true
+      prefix_dir: ${_param:ceph_client_prefix_dir}
diff --git a/metadata/service/monitoring/cluster_stats.yml b/metadata/service/monitoring/cluster_stats.yml
new file mode 100644
index 0000000..02fc70a
--- /dev/null
+++ b/metadata/service/monitoring/cluster_stats.yml
@@ -0,0 +1,10 @@
+applications:
+- ceph
+classes:
+- service.ceph.support
+parameters:
+  ceph:
+    monitoring:
+      cluster_stats:
+        enabled: true
+        ceph_user: ${_param:ceph_monitoring_user}
diff --git a/metadata/service/monitoring/node_stats.yml b/metadata/service/monitoring/node_stats.yml
new file mode 100644
index 0000000..4462309
--- /dev/null
+++ b/metadata/service/monitoring/node_stats.yml
@@ -0,0 +1,9 @@
+applications:
+- ceph
+classes:
+- service.ceph.support
+parameters:
+  ceph:
+    monitoring:
+      node_stats:
+        enabled: true
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 532aed8..d245299 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -9,3 +9,5 @@
         enabled: true
       sphinx:
         enabled: true
+      telegraf:
+        enabled: true
diff --git a/tests/pillar/ceph_client_container.sls b/tests/pillar/ceph_client_container.sls
new file mode 100644
index 0000000..41fd7d1
--- /dev/null
+++ b/tests/pillar/ceph_client_container.sls
@@ -0,0 +1,26 @@
+ceph:
+  client:
+    enabled: true
+    container_mode: true
+    prefix_dir: /srv/volumes/ceph
+    config:
+      global:
+        fsid: 00000000-0000-0000-0000-000000000000
+        mon initial members: ceph1,ceph2,ceph3
+        mon host: 10.103.255.252:6789,10.103.255.253:6789,10.103.255.254:6789
+        osd_fs_mkfs_arguments_xfs:
+        osd_fs_mount_options_xfs: rw,noatime
+        network public: 10.0.0.0/24
+        network cluster: 10.0.0.0/24
+        osd_fs_type: xfs
+      osd:
+        osd journal size: 7500
+        filestore xattr use omap: true
+      mon:
+        mon debug dump transactions: false
+    keyring:
+      cinder:
+        key: 00000000000000000000000000000000000000==
+      glance:
+        key: 00000000000000000000000000000000000000==
+