Additional Ceph alerts for cluster state prediction
Change-Id: I49aba5fbd3acededee7040b9380e663c5ee9c382
Related-Prod: PROD-27983
diff --git a/ceph/map.jinja b/ceph/map.jinja
index b2fde96..9533c09 100644
--- a/ceph/map.jinja
+++ b/ceph/map.jinja
@@ -6,6 +6,7 @@
container_mode: False
prefix_dir: ''
services: []
+ enable_prediction: False
{%- endload %}
{% set common = salt['grains.filter_by'](common_defaults, merge=salt['pillar.get']('ceph:common')) %}
@@ -89,6 +90,14 @@
node_stats: {}
osd_pgnum_warning: 200
osd_pgnum_critical: 300
+ prediction_threshold: 7
+ write_latency_threshold: 1.5
+ read_latency_threshold: 1.5
+ pool_iops_limit: 100
+ pool_iops_threshold: 2
+ osd_iops_limit: 100
+ osd_iops_threshold: 3
+ space_threshold: 0.85
{%- endload %}
{% set monitoring = salt['grains.filter_by'](monitoring_defaults, merge=salt['pillar.get']('ceph:monitoring')) %}
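Note that the prediction alerts stay disabled until they are switched on through pillar data; the keys above only provide the defaults. A minimal pillar sketch (the override values here are illustrative, only the key names come from the defaults above):

    ceph:
      common:
        enable_prediction: True
      monitoring:
        prediction_threshold: 7
        osd_iops_limit: 250
        pool_iops_limit: 500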
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index d29c409..0c9da93 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -158,6 +158,19 @@
{%- if mon is defined and mon.get('enabled') %}
{%- raw %}
server:
+ recording:
+ ceph_osd_op:rate5m:
+ query: >-
+ rate(ceph_osd_op[5m])
+ ceph_pool_ops:rate5m:
+ query: >-
+ rate(ceph_pool_rd[5m]) + rate(ceph_pool_wr[5m])
+ diskio_write_time:rate5m:
+ query: >-
+ rate(diskio_write_time[5m])
+ diskio_read_time:rate5m:
+ query: >-
+ rate(diskio_read_time[5m])
alert:
CephClusterHealthMinor:
if: >-
@@ -252,6 +265,77 @@
summary: "Some OSDs have more than {% endraw %}{{threshold}}{% raw %} PGs"
description: "Some OSDs contain more than {% endraw %}{{threshold}}{% raw %} PGs. This may have a negative impact on the cluster performance. For details, run 'ceph pg dump'"
{%- endraw %}
+ {%- if common.get('enable_prediction', False) %}
+ CephPredictOsdIOPSthreshold:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set osd_iops_limit = monitoring.osd_iops_limit %}
+ if: >-
+ predict_linear(ceph_osd_op:rate5m[{{threshold}}d], {{threshold}} * 86400) > {{osd_iops_limit}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly"
+ description: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly and are predicted to exceed the configured limit."
+ CephPredictOsdIOPSauto:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set iops_threshold = monitoring.osd_iops_threshold %}
+ if: >-
+ predict_linear(ceph_osd_op:rate5m[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(ceph_osd_op:rate5m[1d]) * {{ iops_threshold }}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly"
+ description: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly compared to their daily average."
+ CephPredictUsageRAM:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ if: >-
+ predict_linear(mem_free{host=~"cmn.*|rgw.*|osd.*"}[{{threshold}}d], {{threshold}} * 86400) < 0
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "{{$labels.host}} might run out of free RAM within the next week"
+ description: "{{$labels.host}} might run out of free RAM within the next week."
+ CephPredictOsdWriteLatency:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set write_latency_threshold = monitoring.write_latency_threshold %}
+ if: >-
+ predict_linear(diskio_write_time:rate5m{host=~"osd.*",name=~"sd[b-z]*"}[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(diskio_write_time:rate5m[1d]) * {{write_latency_threshold}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "{{$labels.name}} on {{$labels.host}} might become unresponsive soon"
+ description: "Write latency on {{$labels.name}} on {{$labels.host}} is increasing and the device might become unresponsive soon. Check the OSDs with the highest load on the Grafana OSD overview dashboard."
+ CephPredictOsdReadLatency:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set read_latency_threshold = monitoring.read_latency_threshold %}
+ if: >-
+ predict_linear(diskio_read_time:rate5m{host=~"osd.*",name=~"sd[b-z]*"}[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(diskio_read_time:rate5m[1d]) * {{read_latency_threshold}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "{{$labels.name}} on {{$labels.host}} might become unresponsive soon"
+ description: "Read latency on {{$labels.name}} on {{$labels.host}} is increasing and the device might become unresponsive soon. Check the OSDs with the highest load on the Grafana OSD overview dashboard."
+ {%- endraw %}
+ {%- endif %}
{%- if setup.pool is defined %}
{%- for pool_name, pool in setup.pool.iteritems() %}
{%- if monitoring.pool is defined and monitoring.pool[pool_name] is defined %}
@@ -281,6 +365,44 @@
annotations:
summary: "{{100*threshold}}% of Ceph pool space is used"
description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ {%- if common.get('enable_prediction', False) %}
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}Space:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set space_threshold = monitoring_pool.get('space_threshold', monitoring.space_threshold) %}
+ if: >-
+ predict_linear(ceph_pool_bytes_used[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > (ceph_pool_bytes_used + ceph_pool_max_avail) * {{space_threshold}} * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "{{pool_name}} may use more than {{100*space_threshold}}% of its capacity within the next week"
+ description: "The Ceph {{pool_name}} pool is predicted to use more than {{100*space_threshold}}% of its capacity within the next week. For details, run 'ceph df' and plan the appropriate actions."
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}IOPSthreshold:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set iops_limit = monitoring_pool.get('pool_iops_limit', monitoring.pool_iops_limit) %}
+ if: >-
+ predict_linear(ceph_pool_ops:rate5m[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{ iops_limit }}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "IOPS on the {{pool_name}} pool are increasing rapidly"
+ description: "IOPS on the {{pool_name}} pool are increasing rapidly."
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}IOPSauto:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set iops_threshold = monitoring_pool.get('pool_iops_threshold', monitoring.pool_iops_threshold) %}
+ if: >-
+ predict_linear(ceph_pool_ops:rate5m[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > avg_over_time(ceph_pool_ops:rate5m[1d]) * {{ iops_threshold }}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "IOPS on the {{pool_name}} pool are increasing rapidly"
+ description: "IOPS on the {{pool_name}} pool are increasing rapidly compared to their daily average."
+ {%- endif -%}
{%- endfor %}
{%- endif -%}
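For reference, with the defaults above (prediction_threshold: 7, osd_iops_limit: 100) and no pillar overrides, the CephPredictOsdIOPSthreshold rule renders to roughly the following PromQL (a sketch of the expected output, not copied from a rendered config):

    predict_linear(ceph_osd_op:rate5m[7d], 7 * 86400) > 100

i.e. the alert fires when a linear fit over the last seven days of the recorded 5-minute OSD operation rate predicts the rate to exceed 100 IOPS seven days ahead, and the condition holds for 30 minutes.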