Merge "Set default interval to 40s when Jewel Ceph in use" into release/2019.2.0
diff --git a/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json b/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
index eb1fe5c..211509e 100644
--- a/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
+++ b/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
@@ -1328,7 +1328,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceph_pgmap_write_op_per_sec or absent(ceph_pgmap_write_op_per_sec) - 1",
+ "expr": "avg(ceph_pgmap_write_op_per_sec)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1337,7 +1337,7 @@
"step": 60
},
{
- "expr": "ceph_pgmap_read_op_per_sec or absent(ceph_pgmap_read_op_per_sec) - 1",
+ "expr": "avg(ceph_pgmap_read_op_per_sec)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1421,7 +1421,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceph_pgmap_write_bytes_sec or absent(ceph_pgmap_write_bytes_sec) - 1 ",
+ "expr": "avg(ceph_pgmap_write_bytes_sec)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1430,7 +1430,7 @@
"step": 60
},
{
- "expr": "ceph_pgmap_read_bytes_sec or absent(ceph_pgmap_read_bytes_sec) - 1",
+ "expr": "avg(ceph_pgmap_read_bytes_sec)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1637,7 +1637,7 @@
"linewidth": 0,
"links": [],
"minSpan": null,
- "nullPointMode": "connected",
+ "nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@@ -1656,7 +1656,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceph_usage_total_bytes - ceph_usage_total_used_bytes",
+ "expr": "avg(ceph_usage_total_bytes - ceph_usage_total_used_bytes)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1665,7 +1665,7 @@
"step": 60
},
{
- "expr": "ceph_usage_total_used_bytes or absent(ceph_usage_total_used_bytes) - 1",
+ "expr": "avg(ceph_usage_total_used_bytes)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1674,7 +1674,7 @@
"step": 60
},
{
- "expr": "ceph_usage_total_bytes or absent(ceph_usage_total_bytes) - 1",
+ "expr": "avg(ceph_usage_total_bytes)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
diff --git a/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json b/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
index a669b33..1c402e0 100644
--- a/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
+++ b/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
@@ -437,7 +437,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m])) or absent(avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m]))) - 1",
+ "expr": "avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -446,7 +446,7 @@
"step": 60
},
{
- "expr": "avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m])) or absent(avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m]))) - 1",
+ "expr": "avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -530,7 +530,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m])) or absent(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m])) - 1",
+ "expr": "avg(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -539,7 +539,7 @@
"step": 60
},
{
- "expr": "avg(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m])) or absent(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m])) - 1 ",
+ "expr": "avg(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
diff --git a/ceph/map.jinja b/ceph/map.jinja
index b2fde96..9533c09 100644
--- a/ceph/map.jinja
+++ b/ceph/map.jinja
@@ -6,6 +6,7 @@
container_mode: False
prefix_dir: ''
services: []
+ enable_prediction: False
{%- endload %}
{% set common = salt['grains.filter_by'](common_defaults, merge=salt['pillar.get']('ceph:common')) %}
@@ -89,6 +90,14 @@
node_stats: {}
osd_pgnum_warning: 200
osd_pgnum_critical: 300
+ prediction_threshold: 7
+ write_latency_threshold: 1.5
+ read_latency_threshold: 1.5
+ pool_iops_limit: 100
+ pool_iops_threshold: 2
+ osd_iops_limit: 100
+ osd_iops_threshold: 3
+ space_threshold: 0.85
{%- endload %}
{% set monitoring = salt['grains.filter_by'](monitoring_defaults, merge=salt['pillar.get']('ceph:monitoring')) %}
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index d29c409..0c9da93 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -158,6 +158,19 @@
{%- if mon is defined and mon.get('enabled') %}
{%- raw %}
server:
+ recording:
+ ceph_osd_op:rate5m:
+ query: >-
+ rate(ceph_osd_op[5m])
+ ceph_pool_ops:rate5m:
+ query: >-
+ rate(ceph_pool_rd[5m]) + rate(ceph_pool_wr[5m])
+ diskio_write_time:rate5m:
+ query: >-
+ rate(diskio_write_time[5m])
+ diskio_read_time:rate5m:
+ query: >-
+ rate(diskio_read_time[5m])
alert:
CephClusterHealthMinor:
if: >-
@@ -252,6 +265,77 @@
summary: "Some OSDs have more than {% endraw %}{{threshold}}{% raw %} PGs"
description: "Some OSDs contain more than {% endraw %}{{threshold}}{% raw %} PGs. This may have a negative impact on the cluster performance. For details, run 'ceph pg dump'"
{%- endraw %}
+ {%- if common.get('enable_prediction', False) %}
+ CephPredictOsdIOPSthreshold:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set osd_iops_limit = monitoring.osd_iops_limit %}
+ if: >-
+ predict_linear(ceph_osd_op:rate5m[{{threshold}}d], {{threshold}} * 86400) > {{osd_iops_limit}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly"
+          description: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly and are predicted to exceed the configured limit."
+ CephPredictOsdIOPSauto:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set iops_threshold = monitoring.osd_iops_threshold %}
+ if: >-
+ predict_linear(ceph_osd_op:rate5m[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(ceph_osd_op:rate5m[1d]) * {{ iops_threshold }}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly"
+          description: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly compared to the daily average."
+ CephPredictUsageRAM:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ if: >-
+ predict_linear(mem_free{host=~"cmn.*|rgw.*|osd.*"}[{{threshold}}d], {{threshold}} * 86400) < 0
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "{{$labels.host}} might exhaust its available RAM within the next week"
+          description: "{{$labels.host}} might exhaust its available RAM within the next week."
+ CephPredictOsdWriteLatency:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set write_latency_threshold = monitoring.write_latency_threshold %}
+ if: >-
+ predict_linear(diskio_write_time:rate5m{host=~"osd.*",name=~"sd[b-z]*"}[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(diskio_write_time:rate5m[1d]) * {{write_latency_threshold}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "{{$labels.name}} on {{$labels.host}} might become unresponsive in a short time"
+          description: "{{$labels.name}} on {{$labels.host}} might become unresponsive in a short time. Check the OSDs with the highest load on the Grafana OSD overview dashboard."
+ CephPredictOsdReadLatency:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set read_latency_threshold = monitoring.read_latency_threshold %}
+ if: >-
+ predict_linear(diskio_read_time:rate5m{host=~"osd.*",name=~"sd[b-z]*"}[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(diskio_read_time:rate5m[1d]) * {{read_latency_threshold}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "{{$labels.name}} on {{$labels.host}} might become unresponsive in a short time"
+          description: "{{$labels.name}} on {{$labels.host}} might become unresponsive in a short time. Check the OSDs with the highest load on the Grafana OSD overview dashboard."
+ {%- endraw %}
+ {%- endif %}
{%- if setup.pool is defined %}
{%- for pool_name, pool in setup.pool.iteritems() %}
{%- if monitoring.pool is defined and monitoring.pool[pool_name] is defined %}
@@ -281,6 +365,44 @@
annotations:
summary: "{{100*threshold}}% of Ceph pool space is used"
description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ {%- if common.get('enable_prediction', False) %}
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}Space:
+ {%- set threshold = monitoring.prediction_threshold %}
+      {%- set space_threshold = monitoring.space_threshold %}
+ if: >-
+ predict_linear(ceph_pool_bytes_used[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > (ceph_pool_bytes_used + ceph_pool_max_avail) * {{space_threshold}} * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "The {{pool_name}} pool is predicted to use more than {{100*space_threshold}}% of its capacity within {{threshold}} days"
+          description: "The {{pool_name}} pool is predicted to use more than {{100*space_threshold}}% of its capacity within {{threshold}} days. For details, run 'ceph df' and plan the required actions."
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}IOPSthreshold:
+ {%- set threshold = monitoring.prediction_threshold %}
+      {%- set iops_limit = monitoring.pool_iops_limit %}
+ if: >-
+ predict_linear(ceph_pool_ops:rate5m[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{ iops_limit }}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "IOPS on the {{pool_name}} pool are increasing rapidly"
+          description: "IOPS on the {{pool_name}} pool are increasing rapidly and are predicted to exceed the configured limit."
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}IOPSauto:
+ {%- set threshold = monitoring.prediction_threshold %}
+      {%- set iops_threshold = monitoring.pool_iops_threshold %}
+ if: >-
+ predict_linear(ceph_pool_ops:rate5m[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > avg_over_time(ceph_pool_ops:rate5m[1d]) * {{ iops_threshold }}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "IOPS on the {{pool_name}} pool are increasing rapidly"
+          description: "IOPS on the {{pool_name}} pool are increasing rapidly compared to the daily average."
+ {%- endif -%}
{%- endfor %}
{%- endif -%}
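
For reference, with the defaults from ceph/map.jinja (prediction_threshold: 7,
osd_iops_threshold: 3), the CephPredictOsdIOPSauto rule above should render to
roughly the following expression (a sketch of the expected output, not a
captured render):

    predict_linear(ceph_osd_op:rate5m[7d], 7 * 86400)
      > avg_over_time(ceph_osd_op:rate5m[1d]) * 3

That is, the alert fires when the linear extrapolation of the per-OSD op rate
over the last 7 days, projected 7 days ahead, exceeds three times its current
daily average, where ceph_osd_op:rate5m is the recording rule added in
ceph/meta/prometheus.yml above.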