{#-
  Prometheus alert definitions for Ceph, rendered with Jinja2 into a YAML
  mapping of alert name to rule under server.alert.

  Rules are emitted only when the Ceph monitor role is enabled, or when
  cluster statistics collection is enabled and a ceph_user is configured.

  Static rules are kept inside raw sections so that the Prometheus
  value placeholder used in annotations is passed through untouched instead
  of being evaluated by Jinja. Raw mode is left only where a pillar-driven
  threshold has to be interpolated.
#}
{%- from "ceph/map.jinja" import thresholds, mon, monitoring, setup with context %}

{%- if (mon is defined and mon.get('enabled')) or (monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined) %}
{% raw %}
server:
  alert:
    # Overall health metric equal to 2 is reported as the WARNING state.
    CephClusterHealthMinor:
      if: >-
        ceph_overall_health == 2
      for: 3m
      labels:
        severity: minor
        service: ceph
      annotations:
        summary: "Ceph cluster health is WARNING"
        description: "The Ceph cluster is in the WARNING state. For details, run 'ceph -s'."
    # Overall health metric equal to 3 is reported as the CRITICAL state.
    CephClusterHealthCritical:
      if: >-
        ceph_overall_health == 3
      for: 3m
      labels:
        severity: critical
        service: ceph
      annotations:
        summary: "Ceph cluster health is CRITICAL"
        description: "The Ceph cluster is in the CRITICAL state. For details, run 'ceph -s'."
    # Percentage of monitors missing from the quorum; fires when above zero.
    CephMonitorDownMinor:
      if: >-
        100 * (1 - ceph_num_mon_quorum / ceph_num_mon) > 0
      for: 3m
      labels:
        severity: minor
        service: ceph
      annotations:
        summary: "Ceph Monitors are down"
        description: "{{ $value }}% of Ceph Monitors are down. For details, run 'ceph -s'."
    # Percentage of OSDs that are not up; fires when above zero.
    CephOsdDownMinor:
      if: >-
        100 * (1 - ceph_osdmap_num_up_osds / ceph_osdmap_num_osds) > 0
      for: 3m
      labels:
        severity: minor
        service: ceph
      annotations:
        summary: "Ceph OSDs are down"
        description: "{{ $value }}% of Ceph OSDs are down. For details, run 'ceph df'."
    # Used OSD bytes exceed the warning fraction of total OSD bytes
    # (monitoring.space_used_warning_threshold, default 0.75).
    CephOsdSpaceUsageWarning:
    {%- endraw %}
    {%- set threshold = monitoring.space_used_warning_threshold|default('0.75')|float %}
      if: >-
        ceph_osd_bytes_used > ceph_osd_bytes * {{threshold}}
    {%- raw %}
      for: 3m
      labels:
        severity: warning
        service: ceph
      annotations:
        summary: "{% endraw %}{{100*threshold}}{% raw %}% of Ceph space is used"
        description: "{{ $value }} bytes of Ceph OSD space (>= {% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
    # Used OSD bytes exceed the critical fraction of total OSD bytes
    # (monitoring.space_used_critical_threshold, default 0.85).
    CephOsdSpaceUsageMajor:
    {%- endraw %}
    {%- set threshold = monitoring.space_used_critical_threshold|default('0.85')|float %}
      if: >-
        ceph_osd_bytes_used > ceph_osd_bytes * {{threshold}}
    {%- raw %}
      for: 3m
      labels:
        severity: major
        service: ceph
      annotations:
        summary: "{% endraw %}{{100*threshold}}{% raw %}% of Ceph space is used"
        description: "{{ $value }} bytes of Ceph OSD space (>= {% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
{% endraw %}
{%- if setup.pool is defined %}
  {#- One set of rules per configured pool. items() is used rather than
      iteritems() so the template renders under both Python 2 and 3. #}
  {%- for pool_name, pool in setup.pool.items() %}
    {#- Per-pool overrides win; otherwise thresholds are looked up on the
        top-level monitoring mapping and finally fall back to the defaults. #}
    {%- if monitoring.pool is defined and monitoring.pool[pool_name] is defined %}
      {%- set monitoring_pool = monitoring.pool[pool_name] %}
    {%- else %}
      {%- set monitoring_pool = monitoring %}
    {%- endif %}
    {#- Alert names drop dots and dashes from the pool name. The fill ratio
        is used / (max_avail + used). #}
    CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageWarning:
    {%- set threshold = monitoring_pool.pool_space_used_utilization_warning_threshold|default('0.75')|float %}
      if: >-
        ceph_pool_usage_bytes_used{name="{{pool_name}}"} / (ceph_pool_usage_max_avail{name="{{pool_name}}"} + ceph_pool_usage_bytes_used{name="{{pool_name}}"}) > {{threshold}}
      for: 3m
      labels:
        severity: warning
        service: ceph
      annotations:
        summary: "{{100*threshold}}% of Ceph pool space is used"
        description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
    {#- Same ratio with the higher threshold; severity matches the alert name
        so it outranks the warning-level rule above. #}
    CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageCritical:
    {%- set threshold = monitoring_pool.pool_space_used_critical_threshold|default('0.85')|float %}
      if: >-
        ceph_pool_usage_bytes_used{name="{{pool_name}}"} / (ceph_pool_usage_max_avail{name="{{pool_name}}"} + ceph_pool_usage_bytes_used{name="{{pool_name}}"}) > {{threshold}}
      for: 3m
      labels:
        severity: critical
        service: ceph
      annotations:
        summary: "{{100*threshold}}% of Ceph pool space is used"
        description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
    {#- Optional throughput rules; a missing extra_alerts mapping is treated
        as disabled instead of failing the render. #}
    {%- if monitoring.cluster_stats.get("extra_alerts", {}).get("enabled", False) %}
    CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteOpsTooHigh:
    {%- set threshold = monitoring_pool.pool_write_ops_threshold|default('200')|float %}
      if: >-
        ceph_pool_stats_write_op_per_sec{name="{{pool_name}}"} > {{threshold}}
      for: 3m
      labels:
        severity: warning
        service: ceph
      annotations:
        summary: "{{threshold}} Ceph pool write operations per second"
        description: "The number of Ceph {{pool_name}} pool write operations per second is {{threshold}} for 3 minutes."
    CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteBytesTooHigh:
    {%- set threshold = monitoring_pool.pool_write_bytes_threshold|default('70000000')|float %}
      if: >-
        ceph_pool_stats_write_bytes_sec{name="{{pool_name}}"} > {{threshold}}
      for: 3m
      labels:
        severity: warning
        service: ceph
      annotations:
        summary: "{{threshold}} Ceph pool write bytes per second"
        description: "The number of Ceph {{pool_name}} pool write bytes per second is {{threshold}} for 3 minutes."
    CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadOpsTooHigh:
    {%- set threshold = monitoring_pool.pool_read_ops_threshold|default('1000')|float %}
      if: >-
        ceph_pool_stats_read_op_per_sec{name="{{pool_name}}"} > {{threshold}}
      for: 3m
      labels:
        severity: warning
        service: ceph
      annotations:
        summary: "{{threshold}} Ceph pool read operations per second"
        description: "The number of Ceph {{pool_name}} pool read operations per second is {{threshold}} for 3 minutes."
    CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadBytesTooHigh:
    {%- set threshold = monitoring_pool.pool_read_bytes_threshold|default('70000000')|float %}
      if: >-
        ceph_pool_stats_read_bytes_sec{name="{{pool_name}}"} > {{threshold}}
      for: 3m
      labels:
        severity: warning
        service: ceph
      annotations:
        summary: "{{threshold}} Ceph pool read bytes per second"
        description: "The number of Ceph {{pool_name}} pool read bytes per second is {{threshold}} for 3 minutes."
    {%- endif %}
  {%- endfor %}
{%- endif %}
{%- endif %}