monitoring
PROD-15486
Change-Id: Id7a1d0e080788602d28e3ddc75c1dc670e553865
diff --git a/README.rst b/README.rst
index 3a0abd2..71f2197 100644
--- a/README.rst
+++ b/README.rst
@@ -627,24 +627,22 @@
Ceph monitoring
---------------
-Collect general cluster metrics
+By default monitoring is set up to collect information from MON and OSD nodes. To change the default values, add the following pillar to MON nodes.
.. code-block:: yaml
ceph:
monitoring:
- cluster_stats:
- enabled: true
- ceph_user: monitoring
-
-Collect metrics from monitor and OSD services
-
-.. code-block:: yaml
-
- ceph:
- monitoring:
- node_stats:
- enabled: true
+ space_used_warning_threshold: 0.75
+ space_used_critical_threshold: 0.85
+ apply_latency_threshold: 0.007
+ commit_latency_threshold: 0.7
+ pool_space_used_utilization_warning_threshold: 0.75
+ pool_space_used_critical_threshold: 0.85
+ pool_write_ops_threshold: 200
+ pool_write_bytes_threshold: 70000000
+ pool_read_bytes_threshold: 70000000
+ pool_read_ops_threshold: 1000
More information
diff --git a/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json b/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json
index 40b8f92..4f99242 100644
--- a/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json
+++ b/ceph/files/grafana_dashboards/ceph_cluster_prometheus.json
@@ -170,7 +170,7 @@
"tableColumn": "",
"targets": [
{
- "expr": "sum(ceph_service_service_health{service=\"mons\"})",
+ "expr": "ceph_num_mon_quorum",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
diff --git a/ceph/files/telegraf.conf b/ceph/files/telegraf.conf
index 2ce6c27..3e8ad9b 100644
--- a/ceph/files/telegraf.conf
+++ b/ceph/files/telegraf.conf
@@ -4,13 +4,9 @@
{%- endif %}
gather_admin_socket_stats = {{ values.gather_admin_socket_stats|lower }}
gather_cluster_stats = {{ values.gather_cluster_stats|lower }}
- gather_pool_loads = {{ values.gather_pool_loads|lower }}
{%- if values.ceph_binary is defined %}
ceph_binary = "{{ values.ceph_binary }}"
{%- endif %}
-{%- if values.rados_binary is defined %}
- rados_binary = "{{ values.rados_binary }}"
-{%- endif %}
{%- if values.ceph_config is defined %}
ceph_config = "{{ values.ceph_config }}"
{%- endif %}
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index ad98a5f..fe2cb57 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -1,6 +1,6 @@
-{%- from "ceph/map.jinja" import monitoring with context %}
+{%- from "ceph/map.jinja" import mon, monitoring, setup with context %}
-{%- if monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined %}
+{%- if (mon is defined and mon.get('enabled')) or (monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined) %}
{% raw %}
server:
alert:
@@ -22,5 +22,129 @@
annotations:
summary: "Ceph health warning"
description: "Ceph health is 'warning'. Run 'ceph -s' to get details."
+ CephNumMonQuorumWarning:
+ if: >-
+ ceph_num_mon > ceph_num_mon_quorum
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph Mon node down warning"
+ description: "Ceph Mon node is down. Run 'ceph -s' to get details."
+ CephNumOsdWarning:
+ if: >-
+ ceph_osdmap_num_osds > ceph_osdmap_num_up_osds
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph OSDs down warning"
+ description: "Ceph OSD is down. Run 'ceph osd tree' to get details."
+ CephUsedSpaceWarning:
+ if: >-
+ ceph_osd_bytes_used / ceph_osd_bytes > {%- endraw %} {{monitoring.space_used_warning_threshold|default('0.75')|float}} {%- raw %}
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph used space warning"
+ description: "Ceph OSD free space utilization warning. Run 'ceph df' to get details."
+ CephUsedSpaceCritical:
+ if: >-
+ ceph_osd_bytes_used / ceph_osd_bytes > {%- endraw %} {{monitoring.space_used_critical_threshold|default('0.85')|float}} {%- raw %}
+ labels:
+ severity: critical
+ service: ceph
+ annotations:
+ summary: "Ceph used space critical"
+ description: "Ceph OSD free space utilization critical. Run 'ceph df' to get details."
+ CephApplyLatencyTooHigh:
+ if: >-
+ avg(ceph_apply_latency_sum) / avg(ceph_apply_latency_avgcount) > {%- endraw %} {{monitoring.apply_latency_threshold|default('0.007')|float}} {%- raw %}
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph apply latency too high"
+ description: "Ceph apply latency too high."
+ CephCommitLatencyTooHigh:
+ if: >-
+ avg(ceph_commitcycle_latency_sum) / avg(ceph_commitcycle_latency_avgcount) > {%- endraw %} {{monitoring.commit_latency_threshold|default('0.7')|float}} {%- raw %}
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph commit latency too high"
+ description: "Ceph commit latency too high."
{% endraw %}
+{%- if setup.pool is defined %}
+{%- for pool_name, pool in setup.pool.items() %}
+ CephPoolUsedSpaceWarning{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+ if: >-
+ ceph_pool_usage_bytes_used{name="{% endraw %}{{pool_name}}{% raw %}"} / ceph_pool_usage_max_avail{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_space_used_utilization_warning_threshold|default('0.75')|float}} {% raw %}
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} space utilization warning"
+ description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} free space utilization warning. Run 'ceph df' to get details."
+{% endraw %}
+ CephPoolUsedSpaceCritical{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+ if: >-
+ ceph_pool_usage_bytes_used{name="{% endraw %}{{pool_name}}{% raw %}"} / ceph_pool_usage_max_avail{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_space_used_critical_threshold|default('0.85')|float}} {% raw %}
+ labels:
+ severity: critical
+ service: ceph
+ annotations:
+ summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} space utilization critical"
+ description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} free space utilization critical. Run 'ceph df' to get details."
+{% endraw %}
+ CephPoolWriteOpsTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+ if: >-
+ ceph_pool_stats_write_op_per_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_write_ops_threshold|default('200')|float}} {% raw %}
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write ops too high"
+ description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write ops too high."
+{% endraw %}
+ CephPoolWriteBytesTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+ if: >-
+ ceph_pool_stats_write_bytes_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_write_bytes_threshold|default('70000000')|float}} {% raw %}
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write bytes too high"
+ description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write bytes too high."
+{% endraw %}
+ CephPoolReadBytesTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+ if: >-
+ ceph_pool_stats_read_bytes_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_read_bytes_threshold|default('70000000')|float}} {% raw %}
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read bytes too high"
+ description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read bytes too high."
+{% endraw %}
+ CephPoolReadOpsTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
+{% raw %}
+ if: >-
+ ceph_pool_stats_read_op_per_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_read_ops_threshold|default('1000')|float}} {% raw %}
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read ops too high"
+ description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read ops too high."
+{% endraw %}
+{%- endfor %}
+{%- endif %}
{%- endif %}
diff --git a/ceph/meta/telegraf.yml b/ceph/meta/telegraf.yml
index b608585..f544e37 100644
--- a/ceph/meta/telegraf.yml
+++ b/ceph/meta/telegraf.yml
@@ -1,30 +1,73 @@
-{%- from "ceph/map.jinja" import monitoring with context %}
+{%- from "ceph/map.jinja" import mon, osd, monitoring with context %}
-{%- if monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined %}
+{%- if mon is defined and mon.get('enabled') %}
remote_agent:
input:
ceph:
template: ceph/files/telegraf.conf
- ceph_user: client.{{ monitoring.cluster_stats.ceph_user }}
+{%- if monitoring.cluster_stats is defined %}
+ ceph_user: client.{{ monitoring.cluster_stats.ceph_user|default('admin') }}
+ gather_admin_socket_stats: {{ monitoring.cluster_stats.gather_admin_socket_stats|default('false') }}
+ gather_cluster_stats: {{ monitoring.cluster_stats.gather_cluster_stats|default('true') }}
{%- if monitoring.cluster_stats.ceph_binary is defined %}
ceph_binary: {{ monitoring.cluster_stats.ceph_binary }}
{%- endif %}
-{%- if monitoring.rados_binary is defined %}
- rados_binary: {{ monitoring.cluster_stats.rados_binary }}
+{%- if monitoring.cluster_stats.socket_dir is defined %}
+ socket_dir: {{ monitoring.cluster_stats.socket_dir }}
{%- endif %}
- gather_admin_socket_stats: false
- gather_cluster_stats: true
- gather_pool_loads: {{ monitoring.cluster_stats.gather_pool_loads|default('true') }}
+{%- if monitoring.cluster_stats.mon_prefix is defined %}
+ mon_prefix: {{ monitoring.cluster_stats.mon_prefix }}
+{%- endif %}
+{%- if monitoring.cluster_stats.osd_prefix is defined %}
+ osd_prefix: {{ monitoring.cluster_stats.osd_prefix }}
+{%- endif %}
{%- if monitoring.interval is defined %}
interval: {{ monitoring.interval }}
{%- endif %}
+{%- else %}
+ ceph_user: client.admin
+ gather_admin_socket_stats: false
+ gather_cluster_stats: true
{%- endif %}
-
-{%- if monitoring.node_stats.get('enabled') %}
agent:
input:
ceph:
template: ceph/files/telegraf.conf
+{%- if monitoring.cluster_stats is defined %}
+ ceph_user: client.{{ monitoring.cluster_stats.ceph_user|default('admin') }}
+ gather_admin_socket_stats: {{ monitoring.cluster_stats.gather_admin_socket_stats|default('true') }}
+ gather_cluster_stats: {{ monitoring.cluster_stats.gather_cluster_stats|default('false') }}
+{%- if monitoring.cluster_stats.ceph_binary is defined %}
+ ceph_binary: {{ monitoring.cluster_stats.ceph_binary }}
+{%- endif %}
+{%- if monitoring.cluster_stats.socket_dir is defined %}
+ socket_dir: {{ monitoring.cluster_stats.socket_dir }}
+{%- endif %}
+{%- if monitoring.cluster_stats.mon_prefix is defined %}
+ mon_prefix: {{ monitoring.cluster_stats.mon_prefix }}
+{%- endif %}
+{%- if monitoring.cluster_stats.osd_prefix is defined %}
+ osd_prefix: {{ monitoring.cluster_stats.osd_prefix }}
+{%- endif %}
+{%- if monitoring.interval is defined %}
+ interval: {{ monitoring.interval }}
+{%- endif %}
+{%- else %}
+ ceph_user: client.admin
+ gather_admin_socket_stats: true
+ gather_cluster_stats: false
+{%- endif %}
+
+{%- endif %}
+
+{%- if osd is defined and osd.get('enabled') %}
+agent:
+ input:
+ ceph:
+ template: ceph/files/telegraf.conf
+{%- if monitoring.node_stats is defined %}
+ gather_admin_socket_stats: {{ monitoring.node_stats.gather_admin_socket_stats|default('true') }}
+ gather_cluster_stats: {{ monitoring.node_stats.gather_cluster_stats|default('false') }}
{%- if monitoring.node_stats.socket_dir is defined %}
socket_dir: {{ monitoring.node_stats.socket_dir }}
{%- endif %}
@@ -34,10 +77,11 @@
{%- if monitoring.node_stats.osd_prefix is defined %}
osd_prefix: {{ monitoring.node_stats.osd_prefix }}
{%- endif %}
- gather_admin_socket_stats: true
- gather_cluster_stats: false
- gather_pool_loads: false
{%- if monitoring.interval is defined %}
interval: {{ monitoring.interval }}
{%- endif %}
+{%- else %}
+ gather_admin_socket_stats: true
+ gather_cluster_stats: false
+{%- endif %}
{%- endif %}