Add alert definitions for the Ceph mgr Prometheus plugin
Add a target definition for the Ceph exporter
Change-Id: Ice4720eeba79c3d735df2df74242c11fa0fb2e5c
Related-Prod: PROD-23022
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index a8755ff..f91a4f2 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -1,4 +1,4 @@
-{%- from "ceph/map.jinja" import thresholds, mon, monitoring, setup with context %}
+{%- from "ceph/map.jinja" import thresholds, mon, monitoring, setup, osd with context %}
{%- if (mon is defined and mon.get('enabled')) or (monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined) %}
{% raw %}
@@ -6,7 +6,7 @@
alert:
CephClusterHealthMinor:
if: >-
- ceph_overall_health == 2
+ ceph_health_status == 1
for: 3m
labels:
severity: minor
@@ -16,7 +16,7 @@
description: "The Ceph cluster is in the WARNING state. For details, run 'ceph -s'."
CephClusterHealthCritical:
if: >-
- ceph_overall_health == 3
+ ceph_health_status == 2
for: 3m
labels:
severity: critical
@@ -26,29 +26,29 @@
description: "The Ceph cluster is in the CRITICAL state. For details, run 'ceph -s'."
CephMonitorDownMinor:
if: >-
- 100 * (1 - ceph_num_mon_quorum / ceph_num_mon) > 0
+ count(ceph_mon_quorum_status) - sum(ceph_mon_quorum_status) > 0
for: 3m
labels:
severity: minor
service: ceph
annotations:
summary: "Ceph Monitors are down"
- description: "{{ $value }}% of Ceph Monitors are down. For details, run 'ceph -s'."
+ description: "{{ $value }} of Ceph Monitors are down. For details, run 'ceph -s'."
CephOsdDownMinor:
if: >-
- 100 * (1 - ceph_osdmap_num_up_osds / ceph_osdmap_num_osds) > 0
+ count(ceph_osd_up) - sum(ceph_osd_up) > 0
for: 3m
labels:
severity: minor
service: ceph
annotations:
summary: "Ceph OSDs are down"
- description: "{{ $value }}% of Ceph OSDs are down. For details, run 'ceph osd tree'."
+ description: "{{ $value }} of Ceph OSDs are down. For details, run 'ceph osd tree'."
CephOsdSpaceUsageWarning:
{%- endraw %}
{%- set threshold = monitoring.space_used_warning_threshold|default('0.75')|float %}
if: >-
- ceph_osd_bytes_used > ceph_osd_bytes * {{threshold}}
+ ceph_cluster_total_used_bytes > ceph_cluster_total_bytes * {{threshold}}
{%- raw %}
for: 3m
labels:
@@ -61,7 +61,7 @@
{%- endraw %}
{%- set threshold = monitoring.space_used_critical_threshold|default('0.85')|float %}
if: >-
- ceph_osd_bytes_used > ceph_osd_bytes * {{threshold}}
+ ceph_cluster_total_used_bytes > ceph_cluster_total_bytes * {{threshold}}
{%- raw %}
for: 3m
labels:
@@ -81,7 +81,7 @@
CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageWarning:
{%- set threshold = monitoring_pool.pool_space_used_utilization_warning_threshold|default('0.75')|float %}
if: >-
- ceph_pool_usage_bytes_used{name="{{pool_name}}"} / (ceph_pool_usage_max_avail{name="{{pool_name}}"} + ceph_pool_usage_bytes_used{name="{{pool_name}}"}) > {{threshold}}
+ ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{threshold}}
for: 3m
labels:
severity: warning
@@ -92,7 +92,7 @@
CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageCritical:
{%- set threshold = monitoring_pool.pool_space_used_critical_threshold|default('0.85')|float %}
if: >-
- ceph_pool_usage_bytes_used{name="{{pool_name}}"} / (ceph_pool_usage_max_avail{name="{{pool_name}}"} + ceph_pool_usage_bytes_used{name="{{pool_name}}"}) > {{threshold}}
+ ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{threshold}}
for: 3m
labels:
severity: minor
@@ -100,52 +100,29 @@
annotations:
summary: "{{100*threshold}}% of Ceph pool space is used"
description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
- {%- if monitoring.cluster_stats.extra_alerts is defined and monitoring.cluster_stats.extra_alerts.get("enabled", False) %}
- CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteOpsTooHigh:
- {%- set threshold = monitoring_pool.pool_write_ops_threshold|default('200')|float %}
- if: >-
- ceph_pool_stats_write_op_per_sec{name="{{pool_name}}"} > {{threshold}}
- for: 3m
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "{{threshold}} Ceph pool write operations per second"
- description: "The number of Ceph {{pool_name}} pool write operations per second is {{threshold}} for 3 minutes."
- CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteBytesTooHigh:
- {%- set threshold = monitoring_pool.pool_write_bytes_threshold|default('70000000')|float %}
- if: >-
- ceph_pool_stats_write_bytes_sec{name="{{pool_name}}"} > {{threshold}}
- for: 3m
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "{{threshold}} Ceph pool write bytes per second"
- description: "The number of Ceph {{pool_name}} pool write bytes per second is {{threshold}} for 3 minutes."
- CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadOpsTooHigh:
- {%- set threshold = monitoring_pool.pool_read_ops_threshold|default('1000')|float %}
- if: >-
- ceph_pool_stats_read_op_per_sec{name="{{pool_name}}"} > {{threshold}}
- for: 3m
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "{{threshold}} Ceph pool read operations per second"
- description: "The number of Ceph {{pool_name}} pool read operations per second is {{threshold}} for 3 minutes."
- CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadBytesTooHigh:
- {%- set threshold = monitoring_pool.pool_read_bytes_threshold|default('70000000')|float %}
- if: >-
- ceph_pool_stats_read_bytes_sec{name="{{pool_name}}"} > {{threshold}}
- for: 3m
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "{{threshold}} Ceph pool read bytes per second"
- description: "The number of Ceph {{pool_name}} pool read bytes per second is {{threshold}} for 3 minutes."
- {%- endif %}
{%- endfor %}
{%- endif %}
{%- endif %}
+{#- Prometheus scrape target for the Ceph exporter; rendered only when mon is enabled. #}
+{#- Endpoint address is the first fqdn_ip4 grain entry that does not start with '127.'. #}
+{%- if mon is defined and mon.get('enabled') %}
+{%- set fqdn_ip4_addresses = [] %}
+{%- for addr in grains['fqdn_ip4'] %}
+  {%- if not addr.startswith('127.') %}
+    {%- do fqdn_ip4_addresses.append(addr) %}
+  {%- endif %}
+{%- endfor %}
+{%- set address = fqdn_ip4_addresses[0] %}
+{%- if address is defined %}
+server:
+  target:
+    static:
+      ceph:
+        enabled: true
+        endpoint:
+          - address: {{ address }}
+            port: 9283
+        honor_labels: true
+
+{%- endif %}
+{%- endif %}