Additional Ceph alerts for cluster state prediction
Change-Id: I49aba5fbd3acededee7040b9380e663c5ee9c382
Related-Prod: PROD-27983
diff --git a/ceph/map.jinja b/ceph/map.jinja
index b2fde96..9533c09 100644
--- a/ceph/map.jinja
+++ b/ceph/map.jinja
@@ -6,6 +6,7 @@
container_mode: False
prefix_dir: ''
services: []
+ enable_prediction: False
{%- endload %}
{% set common = salt['grains.filter_by'](common_defaults, merge=salt['pillar.get']('ceph:common')) %}
@@ -89,6 +90,14 @@
node_stats: {}
osd_pgnum_warning: 200
osd_pgnum_critical: 300
+ prediction_threshold: 7
+ write_latency_threshold: 1.5
+ read_latency_threshold: 1.5
+ pool_iops_limit: 100
+ pool_iops_threshold: 2
+ osd_iops_limit: 100
+ osd_iops_threshold: 3
+ space_threshold: 0.85
{%- endload %}
{% set monitoring = salt['grains.filter_by'](monitoring_defaults, merge=salt['pillar.get']('ceph:monitoring')) %}
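Note that the prediction alerts stay disabled until they are switched on through pillar data; the keys above only provide the defaults. A minimal pillar sketch (the override values here are illustrative, only the key names come from the defaults above):

    ceph:
      common:
        enable_prediction: True
      monitoring:
        prediction_threshold: 7
        osd_iops_limit: 250
        pool_iops_limit: 500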
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index d29c409..0c9da93 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -158,6 +158,19 @@
{%- if mon is defined and mon.get('enabled') %}
{%- raw %}
server:
+ recording:
+ ceph_osd_op:rate5m:
+ query: >-
+ rate(ceph_osd_op[5m])
+ ceph_pool_ops:rate5m:
+ query: >-
+ rate(ceph_pool_rd[5m]) + rate(ceph_pool_wr[5m])
+ diskio_write_time:rate5m:
+ query: >-
+ rate(diskio_write_time[5m])
+ diskio_read_time:rate5m:
+ query: >-
+ rate(diskio_read_time[5m])
alert:
CephClusterHealthMinor:
if: >-
@@ -252,6 +265,77 @@
summary: "Some OSDs have more than {% endraw %}{{threshold}}{% raw %} PGs"
description: "Some OSDs contain more than {% endraw %}{{threshold}}{% raw %} PGs. This may have a negative impact on the cluster performance. For details, run 'ceph pg dump'"
{%- endraw %}
+ {%- if common.get('enable_prediction', False) %}
+ CephPredictOsdIOPSthreshold:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set osd_iops_limit = monitoring.osd_iops_limit %}
+ if: >-
+ predict_linear(ceph_osd_op:rate5m[{{threshold}}d], {{threshold}} * 86400) > {{osd_iops_limit}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly"
+ description: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly and are predicted to exceed the configured limit."
+ CephPredictOsdIOPSauto:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set iops_threshold = monitoring.osd_iops_threshold %}
+ if: >-
+ predict_linear(ceph_osd_op:rate5m[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(ceph_osd_op:rate5m[1d]) * {{ iops_threshold }}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly"
+ description: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly compared to their daily average."
+ CephPredictUsageRAM:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ if: >-
+ predict_linear(mem_free{host=~"cmn.*|rgw.*|osd.*"}[{{threshold}}d], {{threshold}} * 86400) < 0
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "{{$labels.host}} might run out of free RAM within the next week"
+ description: "{{$labels.host}} might run out of free RAM within the next week."
+ CephPredictOsdWriteLatency:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set write_latency_threshold = monitoring.write_latency_threshold %}
+ if: >-
+ predict_linear(diskio_write_time:rate5m{host=~"osd.*",name=~"sd[b-z]*"}[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(diskio_write_time:rate5m[1d]) * {{write_latency_threshold}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "{{$labels.name}} on {{$labels.host}} might become unresponsive soon"
+ description: "Write latency on {{$labels.name}} on {{$labels.host}} is increasing and the device might become unresponsive soon. Check the OSDs with the highest load on the Grafana OSD overview dashboard."
+ CephPredictOsdReadLatency:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set read_latency_threshold = monitoring.read_latency_threshold %}
+ if: >-
+ predict_linear(diskio_read_time:rate5m{host=~"osd.*",name=~"sd[b-z]*"}[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(diskio_read_time:rate5m[1d]) * {{read_latency_threshold}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "{{$labels.name}} on {{$labels.host}} might become unresponsive soon"
+ description: "Read latency on {{$labels.name}} on {{$labels.host}} is increasing and the device might become unresponsive soon. Check the OSDs with the highest load on the Grafana OSD overview dashboard."
+ {%- endraw %}
+ {%- endif %}
{%- if setup.pool is defined %}
{%- for pool_name, pool in setup.pool.iteritems() %}
{%- if monitoring.pool is defined and monitoring.pool[pool_name] is defined %}
@@ -281,6 +365,44 @@
annotations:
summary: "{{100*threshold}}% of Ceph pool space is used"
description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ {%- if common.get('enable_prediction', False) %}
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}Space:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set space_threshold = monitoring_pool.get('space_threshold', monitoring.space_threshold) %}
+ if: >-
+ predict_linear(ceph_pool_bytes_used[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > (ceph_pool_bytes_used + ceph_pool_max_avail) * {{space_threshold}} * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "{{pool_name}} may use more than {{100*space_threshold}}% of its capacity within the next week"
+ description: "The Ceph {{pool_name}} pool is predicted to use more than {{100*space_threshold}}% of its capacity within the next week. For details, run 'ceph df' and plan the appropriate actions."
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}IOPSthreshold:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set iops_limit = monitoring_pool.get('pool_iops_limit', monitoring.pool_iops_limit) %}
+ if: >-
+ predict_linear(ceph_pool_ops:rate5m[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{ iops_limit }}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "IOPS on the {{pool_name}} pool are increasing rapidly"
+ description: "IOPS on the {{pool_name}} pool are increasing rapidly."
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}IOPSauto:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set iops_threshold = monitoring_pool.get('pool_iops_threshold', monitoring.pool_iops_threshold) %}
+ if: >-
+ predict_linear(ceph_pool_ops:rate5m[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > avg_over_time(ceph_pool_ops:rate5m[1d]) * {{ iops_threshold }}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "IOPS on the {{pool_name}} pool are increasing rapidly"
+ description: "IOPS on the {{pool_name}} pool are increasing rapidly compared to their daily average."
+ {%- endif -%}
{%- endfor %}
{%- endif -%}
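For reference, with the defaults above (prediction_threshold: 7, osd_iops_limit: 100) and no pillar overrides, the CephPredictOsdIOPSthreshold rule renders to roughly the following PromQL (a sketch of the expected output, not copied from a rendered config):

    predict_linear(ceph_osd_op:rate5m[7d], 7 * 86400) > 100

i.e. the alert fires when a linear fit over the last seven days of the recorded 5-minute OSD operation rate predicts the rate to exceed 100 IOPS seven days ahead, and the condition holds for 30 minutes.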