Add Ceph alert thresholds and role-based Telegraf monitoring

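Gate the Telegraf ceph input on the mon/osd roles instead of the
cluster_stats/node_stats pillars, drop the removed gather_pool_loads and
rados_binary options, read the monitor quorum count from
ceph_num_mon_quorum in the Grafana dashboard, and add Prometheus alerts
for mon quorum, OSD count, space usage, latency, and per-pool usage/IO.
The alert thresholds can be overridden via the ceph:monitoring pillar on
MON nodes, for example (values shown are the defaults; see README.rst for
the full list):

    ceph:
      monitoring:
        space_used_warning_threshold: 0.75
        pool_write_ops_threshold: 200
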
PROD-15486

Change-Id: Id7a1d0e080788602d28e3ddc75c1dc670e553865
diff --git a/README.rst b/README.rst
index 3a0abd2..71f2197 100644
--- a/README.rst
+++ b/README.rst
@@ -627,24 +627,22 @@
 Ceph monitoring
 ---------------
 
-Collect general cluster metrics
+By default, monitoring is set up to collect information from MON and OSD nodes. To change the default thresholds, add the following pillar to MON nodes:
 
 .. code-block:: yaml
 
     ceph:
       monitoring:
-        cluster_stats:
-          enabled: true
-          ceph_user: monitoring
-
-Collect metrics from monitor and OSD services
-
-.. code-block:: yaml
-
-    ceph:
-      monitoring:
-        node_stats:
-          enabled: true
+        space_used_warning_threshold: 0.75
+        space_used_critical_threshold: 0.85
+        apply_latency_threshold: 0.007
+        commit_latency_threshold: 0.7
+        pool_space_used_utilization_warning_threshold: 0.75
+        pool_space_used_critical_threshold: 0.85
+        pool_write_ops_threshold: 200
+        pool_write_bytes_threshold: 70000000
+        pool_read_bytes_threshold: 70000000
+        pool_read_ops_threshold: 1000
 
 
 More information
diff --git a/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json b/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json
index 40b8f92..4f99242 100644
--- a/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json
+++ b/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json
@@ -170,7 +170,7 @@
           "tableColumn": "",
           "targets": [
             {
-              "expr": "sum(ceph_service_service_health{service=\"mons\"})",
+              "expr": "ceph_num_mon_quorum",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 1,
diff --git a/ceph/files/telegraf.conf b/ceph/files/telegraf.conf
index 2ce6c27..3e8ad9b 100644
--- a/ceph/files/telegraf.conf
+++ b/ceph/files/telegraf.conf
@@ -4,13 +4,9 @@
 {%- endif %}
   gather_admin_socket_stats = {{ values.gather_admin_socket_stats|lower }}
   gather_cluster_stats = {{ values.gather_cluster_stats|lower }}
-  gather_pool_loads = {{ values.gather_pool_loads|lower }}
 {%- if values.ceph_binary is defined %}
   ceph_binary = "{{ values.ceph_binary }}"
 {%- endif %}
-{%- if values.rados_binary is defined %}
-  rados_binary = "{{ values.rados_binary }}"
-{%- endif %}
 {%- if values.ceph_config is defined %}
   ceph_config = "{{ values.ceph_config }}"
 {%- endif %}
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index ad98a5f..fe2cb57 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -1,6 +1,6 @@
-{%- from "ceph/map.jinja" import monitoring with context %}
+{%- from "ceph/map.jinja" import mon, monitoring, setup with context %}
 
-{%- if monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined %}
+{%- if (mon is defined and mon.get('enabled')) or (monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined) %}
 {% raw %}
 server:
   alert:
@@ -22,5 +22,129 @@
       annotations:
         summary: "Ceph health warning"
         description: "Ceph health is 'warning'. Run 'ceph -s' to get details."
+    CephNumMonQuorumWarning:
+      if: >-
+        ceph_num_mon > ceph_num_mon_quorum
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph Mon node down warning"
+        description: "Ceph Mon node is down. Run 'ceph -s' to get details."
+    CephNumOsdWarning:
+      if: >-
+        ceph_osdmap_num_osds > ceph_osdmap_num_up_osds
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph OSDs down warning"
+        description: "Ceph OSD is down. Run 'ceph osd tree' to get details."
+    CephUsedSpaceWarning:
+      if: >-
+        ceph_osd_bytes_used / ceph_osd_bytes > {%- endraw %} {{monitoring.space_used_warning_threshold|default('0.75')|float}} {%- raw %}
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph used space warning"
+        description: "Ceph OSD free space utilization warning. Run 'ceph df' to get details."
+    CephUsedSpaceCritical:
+      if: >-
+        ceph_osd_bytes_used / ceph_osd_bytes > {%- endraw %} {{monitoring.space_used_critical_threshold|default('0.85')|float}} {%- raw %}
+      labels:
+        severity: critical
+        service: ceph
+      annotations:
+        summary: "Ceph used space critical"
+        description: "Ceph OSD free space utilization critical. Run 'ceph df' to get details."
+    CephApplyLatencyTooHigh:
+      if: >-
+        avg(ceph_apply_latency_sum) / avg(ceph_apply_latency_avgcount) > {%- endraw %} {{monitoring.apply_latency_threshold|default('0.007')|float}} {%- raw %}
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph apply latency too high"
+        description: "Ceph apply latency too high."
+    CephCommitLatencyTooHigh:
+      if: >-
+        avg(ceph_commit_latency_sum) / avg(ceph_commit_latency_avgcount) > {%- endraw %} {{monitoring.commit_latency_threshold|default('0.7')|float}} {%- raw %}
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph commit latency too high"
+        description: "Ceph commit latency too high."
 {% endraw %}
+{%- if setup.pool is defined %}
+{%- for pool_name, pool in setup.pool.items() %}
+    CephPoolUsedSpaceWarning{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+      if: >-
+        ceph_pool_usage_bytes_used{name="{% endraw %}{{pool_name}}{% raw %}"} / ceph_pool_usage_max_avail{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_space_used_utilization_warning_threshold|default('0.75')|float}} {% raw %}
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} space utilization warning"
+        description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} free space utilization warning. Run 'ceph df' to get details."
+{% endraw %}
+    CephPoolUsedSpaceCritical{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+      if: >-
+        ceph_pool_usage_bytes_used{name="{% endraw %}{{pool_name}}{% raw %}"} / ceph_pool_usage_max_avail{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_space_used_critical_threshold|default('0.85')|float}} {% raw %}
+      labels:
+        severity: critical
+        service: ceph
+      annotations:
+        summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} space utilization critical"
+        description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} free space utilization critical. Run 'ceph df' to get details."
+{% endraw %}
+    CephPoolWriteOpsTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+      if: >-
+        ceph_pool_stats_write_op_per_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_write_ops_threshold|default('200')|float}} {% raw %}
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write ops too high"
+        description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write ops too high."
+{% endraw %}
+    CephPoolWriteBytesTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+      if: >-
+        ceph_pool_stats_write_bytes_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_write_bytes_threshold|default('70000000')|float}} {% raw %}
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write bytes too high"
+        description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write bytes too high."
+{% endraw %}
+    CephPoolReadBytesTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+      if: >-
+        ceph_pool_stats_read_bytes_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_read_bytes_threshold|default('70000000')|float}} {% raw %}
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read bytes too high"
+        description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read bytes too high."
+{% endraw %}
+    CephPoolReadOpsTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+      if: >-
+        ceph_pool_stats_read_op_per_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_read_ops_threshold|default('1000')|float}} {% raw %}
+      labels:
+        severity: warning
+        service: ceph
+      annotations:
+        summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read ops too high"
+        description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read ops too high."
+{% endraw %}
+{%- endfor %}
+{%- endif %}
 {%- endif %}
diff --git a/ceph/meta/telegraf.yml b/ceph/meta/telegraf.yml
index b608585..f544e37 100644
--- a/ceph/meta/telegraf.yml
+++ b/ceph/meta/telegraf.yml
@@ -1,30 +1,73 @@
-{%- from "ceph/map.jinja" import monitoring with context %}
+{%- from "ceph/map.jinja" import mon, osd, monitoring with context %}
 
-{%- if monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined %}
+{%- if mon is defined and mon.get('enabled') %}
 remote_agent:
   input:
     ceph:
       template: ceph/files/telegraf.conf
-      ceph_user: client.{{ monitoring.cluster_stats.ceph_user }}
+{%- if monitoring.cluster_stats is defined %}
+      ceph_user: client.{{ monitoring.cluster_stats.ceph_user|default('admin') }}
+      gather_admin_socket_stats: {{ monitoring.cluster_stats.gather_admin_socket_stats|default('false') }}
+      gather_cluster_stats: {{ monitoring.cluster_stats.gather_cluster_stats|default('true') }}
 {%- if monitoring.cluster_stats.ceph_binary is defined %}
       ceph_binary: {{ monitoring.cluster_stats.ceph_binary }}
 {%- endif %}
-{%- if monitoring.rados_binary is defined %}
-      rados_binary: {{ monitoring.cluster_stats.rados_binary }}
+{%- if monitoring.cluster_stats.socket_dir is defined %}
+      socket_dir: {{ monitoring.cluster_stats.socket_dir }}
 {%- endif %}
-      gather_admin_socket_stats: false
-      gather_cluster_stats: true
-      gather_pool_loads: {{ monitoring.cluster_stats.gather_pool_loads|default('true') }}
+{%- if monitoring.cluster_stats.mon_prefix is defined %}
+      mon_prefix: {{ monitoring.cluster_stats.mon_prefix }}
+{%- endif %}
+{%- if monitoring.cluster_stats.osd_prefix is defined %}
+      osd_prefix: {{ monitoring.cluster_stats.osd_prefix }}
+{%- endif %}
 {%- if monitoring.interval is defined %}
       interval: {{ monitoring.interval }}
 {%- endif %}
+{%- else %}
+      ceph_user: client.admin
+      gather_admin_socket_stats: false
+      gather_cluster_stats: true
 {%- endif %}
-
-{%- if monitoring.node_stats.get('enabled') %}
 agent:
   input:
     ceph:
       template: ceph/files/telegraf.conf
+{%- if monitoring.cluster_stats is defined %}
+      ceph_user: client.{{ monitoring.cluster_stats.ceph_user|default('admin') }}
+      gather_admin_socket_stats: {{ monitoring.cluster_stats.gather_admin_socket_stats|default('true') }}
+      gather_cluster_stats: {{ monitoring.cluster_stats.gather_cluster_stats|default('false') }}
+{%- if monitoring.cluster_stats.ceph_binary is defined %}
+      ceph_binary: {{ monitoring.cluster_stats.ceph_binary }}
+{%- endif %}
+{%- if monitoring.cluster_stats.socket_dir is defined %}
+      socket_dir: {{ monitoring.cluster_stats.socket_dir }}
+{%- endif %}
+{%- if monitoring.cluster_stats.mon_prefix is defined %}
+      mon_prefix: {{ monitoring.cluster_stats.mon_prefix }}
+{%- endif %}
+{%- if monitoring.cluster_stats.osd_prefix is defined %}
+      osd_prefix: {{ monitoring.cluster_stats.osd_prefix }}
+{%- endif %}
+{%- if monitoring.interval is defined %}
+      interval: {{ monitoring.interval }}
+{%- endif %}
+{%- else %}
+      ceph_user: client.admin
+      gather_admin_socket_stats: true
+      gather_cluster_stats: false
+{%- endif %}
+
+{%- endif %}
+
+{%- if osd is defined and osd.get('enabled') %}
+agent:
+  input:
+    ceph:
+      template: ceph/files/telegraf.conf
+{%- if monitoring.node_stats is defined %}
+      gather_admin_socket_stats: {{ monitoring.node_stats.gather_admin_socket_stats|default('true') }}
+      gather_cluster_stats: {{ monitoring.node_stats.gather_cluster_stats|default('false') }}
 {%- if monitoring.node_stats.socket_dir is defined %}
       socket_dir: {{ monitoring.node_stats.socket_dir }}
 {%- endif %}
@@ -34,10 +77,11 @@
 {%- if monitoring.node_stats.osd_prefix is defined %}
       osd_prefix: {{ monitoring.node_stats.osd_prefix }}
 {%- endif %}
-      gather_admin_socket_stats: true
-      gather_cluster_stats: false
-      gather_pool_loads: false
 {%- if monitoring.interval is defined %}
       interval: {{ monitoring.interval }}
 {%- endif %}
+{%- else %}
+      gather_admin_socket_stats: true
+      gather_cluster_stats: false
+{%- endif %}
 {%- endif %}