Merge "Set default interval to 40s when Jewel Ceph in use" into release/2019.2.0
diff --git a/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json b/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
index eb1fe5c..211509e 100644
--- a/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
+++ b/ceph/files/grafana_dashboards/legacy/ceph_cluster_prometheus.json
@@ -1328,7 +1328,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceph_pgmap_write_op_per_sec or absent(ceph_pgmap_write_op_per_sec) - 1",
+ "expr": "avg(ceph_pgmap_write_op_per_sec)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1337,7 +1337,7 @@
"step": 60
},
{
- "expr": "ceph_pgmap_read_op_per_sec or absent(ceph_pgmap_read_op_per_sec) - 1",
+ "expr": "avg(ceph_pgmap_read_op_per_sec)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1421,7 +1421,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceph_pgmap_write_bytes_sec or absent(ceph_pgmap_write_bytes_sec) - 1 ",
+ "expr": "avg(ceph_pgmap_write_bytes_sec)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1430,7 +1430,7 @@
"step": 60
},
{
- "expr": "ceph_pgmap_read_bytes_sec or absent(ceph_pgmap_read_bytes_sec) - 1",
+ "expr": "avg(ceph_pgmap_read_bytes_sec)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1637,7 +1637,7 @@
"linewidth": 0,
"links": [],
"minSpan": null,
- "nullPointMode": "connected",
+ "nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@@ -1656,7 +1656,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceph_usage_total_bytes - ceph_usage_total_used_bytes",
+ "expr": "avg(ceph_usage_total_bytes - ceph_usage_total_used_bytes)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1665,7 +1665,7 @@
"step": 60
},
{
- "expr": "ceph_usage_total_used_bytes or absent(ceph_usage_total_used_bytes) - 1",
+ "expr": "avg(ceph_usage_total_used_bytes)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1674,7 +1674,7 @@
"step": 60
},
{
- "expr": "ceph_usage_total_bytes or absent(ceph_usage_total_bytes) - 1",
+ "expr": "avg(ceph_usage_total_bytes)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
diff --git a/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json b/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
index a669b33..1c402e0 100644
--- a/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
+++ b/ceph/files/grafana_dashboards/legacy/ceph_pools_prometheus.json
@@ -437,7 +437,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m])) or absent(avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m]))) - 1",
+ "expr": "avg(irate(ceph_pool_stats_read_op_per_sec{name=\"$pool\"}[3m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -446,7 +446,7 @@
"step": 60
},
{
- "expr": "avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m])) or absent(avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m]))) - 1",
+ "expr": "avg(irate(ceph_pool_stats_write_op_per_sec{name=\"$pool\"}[3m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -530,7 +530,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m])) or absent(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m])) - 1",
+ "expr": "avg(irate(ceph_pool_stats_read_bytes_sec{name=\"$pool\"}[3m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -539,7 +539,7 @@
"step": 60
},
{
- "expr": "avg(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m])) or absent(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m])) - 1 ",
+ "expr": "avg(irate(ceph_pool_stats_write_bytes_sec{name=\"$pool\"}[3m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
diff --git a/ceph/map.jinja b/ceph/map.jinja
index b2fde96..9533c09 100644
--- a/ceph/map.jinja
+++ b/ceph/map.jinja
@@ -6,6 +6,7 @@
container_mode: False
prefix_dir: ''
services: []
+ enable_prediction: False
{%- endload %}
{% set common = salt['grains.filter_by'](common_defaults, merge=salt['pillar.get']('ceph:common')) %}
@@ -89,6 +90,14 @@
node_stats: {}
osd_pgnum_warning: 200
osd_pgnum_critical: 300
+ prediction_threshold: 7
+ write_latency_threshold: 1.5
+ read_latency_threshold: 1.5
+ pool_iops_limit: 100
+ pool_iops_threshold: 2
+ osd_iops_limit: 100
+ osd_iops_threshold: 3
+ space_threshold: 0.85
{%- endload %}
{% set monitoring = salt['grains.filter_by'](monitoring_defaults, merge=salt['pillar.get']('ceph:monitoring')) %}
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index d29c409..0c9da93 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -158,6 +158,19 @@
{%- if mon is defined and mon.get('enabled') %}
{%- raw %}
server:
+ recording:
+ ceph_osd_op:rate5m:
+ query: >-
+ rate(ceph_osd_op[5m])
+ ceph_pool_ops:rate5m:
+ query: >-
+ rate(ceph_pool_rd[5m]) + rate(ceph_pool_wr[5m])
+ diskio_write_time:rate5m:
+ query: >-
+ rate(diskio_write_time[5m])
+ diskio_read_time:rate5m:
+ query: >-
+ rate(diskio_read_time[5m])
alert:
CephClusterHealthMinor:
if: >-
@@ -252,6 +265,77 @@
summary: "Some OSDs have more than {% endraw %}{{threshold}}{% raw %} PGs"
description: "Some OSDs contain more than {% endraw %}{{threshold}}{% raw %} PGs. This may have a negative impact on the cluster performance. For details, run 'ceph pg dump'"
{%- endraw %}
+ {%- if common.get('enable_prediction', False) %}
+ CephPredictOsdIOPSthreshold:
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set osd_iops_limit = monitoring.osd_iops_limit %}
+ if: >-
+ predict_linear(ceph_osd_op:rate5m[{{threshold}}d], {{threshold}} * 86400) > {{osd_iops_limit}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly"
+          description: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly and are predicted to exceed the configured limit."
+ CephPredictOsdIOPSauto:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set iops_threshold = monitoring.osd_iops_threshold %}
+ if: >-
+ predict_linear(ceph_osd_op:rate5m[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(ceph_osd_op:rate5m[1d]) * {{ iops_threshold }}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly"
+          description: "IOPS on OSD {{ $labels.ceph_daemon }} are increasing rapidly compared to the daily average."
+ CephPredictUsageRAM:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ if: >-
+ predict_linear(mem_free{host=~"cmn.*|rgw.*|osd.*"}[{{threshold}}d], {{threshold}} * 86400) < 0
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "{{$labels.host}} might exhaust its available RAM within the next week"
+          description: "{{$labels.host}} might exhaust its available RAM within the next week."
+ CephPredictOsdWriteLatency:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set write_latency_threshold = monitoring.write_latency_threshold %}
+ if: >-
+ predict_linear(diskio_write_time:rate5m{host=~"osd.*",name=~"sd[b-z]*"}[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(diskio_write_time:rate5m[1d]) * {{write_latency_threshold}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "{{$labels.name}} on {{$labels.host}} might become unresponsive in a short time"
+          description: "{{$labels.name}} on {{$labels.host}} might become unresponsive in a short time. Check the OSDs with the highest load on the Grafana OSD overview dashboard."
+ CephPredictOsdReadLatency:
+ {%- endraw %}
+ {%- set threshold = monitoring.prediction_threshold %}
+ {%- set read_latency_threshold = monitoring.read_latency_threshold %}
+ if: >-
+ predict_linear(diskio_read_time:rate5m{host=~"osd.*",name=~"sd[b-z]*"}[{{threshold}}d], {{threshold}} * 86400) > avg_over_time(diskio_read_time:rate5m[1d]) * {{read_latency_threshold}}
+ {%- raw %}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "{{$labels.name}} on {{$labels.host}} might become unresponsive in a short time"
+          description: "{{$labels.name}} on {{$labels.host}} might become unresponsive in a short time. Check the OSDs with the highest load on the Grafana OSD overview dashboard."
+ {%- endraw %}
+ {%- endif %}
{%- if setup.pool is defined %}
{%- for pool_name, pool in setup.pool.iteritems() %}
{%- if monitoring.pool is defined and monitoring.pool[pool_name] is defined %}
@@ -281,6 +365,44 @@
annotations:
summary: "{{100*threshold}}% of Ceph pool space is used"
description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ {%- if common.get('enable_prediction', False) %}
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}Space:
+ {%- set threshold = monitoring.prediction_threshold %}
+      {%- set space_threshold = monitoring.space_threshold %}
+ if: >-
+ predict_linear(ceph_pool_bytes_used[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > (ceph_pool_bytes_used + ceph_pool_max_avail) * {{space_threshold}} * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "The {{pool_name}} pool is predicted to use more than {{100*space_threshold}}% of its capacity within {{threshold}} days"
+          description: "The {{pool_name}} pool is predicted to use more than {{100*space_threshold}}% of its capacity within {{threshold}} days. For details, run 'ceph df' and plan the required actions."
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}IOPSthreshold:
+ {%- set threshold = monitoring.prediction_threshold %}
+      {%- set iops_limit = monitoring.pool_iops_limit %}
+ if: >-
+ predict_linear(ceph_pool_ops:rate5m[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > {{ iops_limit }}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "IOPS on the {{pool_name}} pool are increasing rapidly"
+          description: "IOPS on the {{pool_name}} pool are increasing rapidly and are predicted to exceed the configured limit."
+ CephPredictPool{{pool_name|replace(".", "")|replace("-", "")}}IOPSauto:
+ {%- set threshold = monitoring.prediction_threshold %}
+      {%- set iops_threshold = monitoring.pool_iops_threshold %}
+ if: >-
+ predict_linear(ceph_pool_ops:rate5m[{{threshold}}d], {{threshold}} * 86400) * on(pool_id) group_left(name) ceph_pool_metadata{name="{{pool_name}}"} > avg_over_time(ceph_pool_ops:rate5m[1d]) * {{ iops_threshold }}
+ for: 30m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+          summary: "IOPS on the {{pool_name}} pool are increasing rapidly"
+          description: "IOPS on the {{pool_name}} pool are increasing rapidly compared to the daily average."
+ {%- endif -%}
{%- endfor %}
{%- endif -%}
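
For reference, with the defaults from ceph/map.jinja (prediction_threshold: 7,
osd_iops_threshold: 3), the CephPredictOsdIOPSauto rule above should render to
roughly the following expression (a sketch of the expected output, not a
captured render):

    predict_linear(ceph_osd_op:rate5m[7d], 7 * 86400)
      > avg_over_time(ceph_osd_op:rate5m[1d]) * 3

That is, the alert fires when the linear extrapolation of the per-OSD op rate
over the last 7 days, projected 7 days ahead, exceeds three times its current
daily average, where ceph_osd_op:rate5m is the recording rule added in
ceph/meta/prometheus.yml above.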