Alerts reworked
Change alert names, severities, and descriptions.
Change-Id: I96996c7d6fc02ec5fd773a6987fc4a3dd5724c23
Closes-bug: PROD-20283
diff --git a/ceph/map.jinja b/ceph/map.jinja
index b8eb105..3140a4c 100644
--- a/ceph/map.jinja
+++ b/ceph/map.jinja
@@ -88,6 +88,7 @@
cluster_stats: {}
node_stats: {}
{%- endload %}
+
{% set monitoring = salt['grains.filter_by'](monitoring_defaults, merge=salt['pillar.get']('ceph:monitoring')) %}
{%- load_yaml as backup_defaults %}
@@ -110,6 +111,5 @@
{% set backup = salt['grains.filter_by'](backup_defaults['backup'], merge=salt['pillar.get']('ceph:backup', {})) %}
-
{#- vim:ft=sls
-#}
diff --git a/ceph/meta/prometheus.yml b/ceph/meta/prometheus.yml
index fe2cb57..e8dc6ca 100644
--- a/ceph/meta/prometheus.yml
+++ b/ceph/meta/prometheus.yml
@@ -1,150 +1,170 @@
-{%- from "ceph/map.jinja" import mon, monitoring, setup with context %}
+{%- from "ceph/map.jinja" import thresholds, mon, monitoring, setup with context %}
{%- if (mon is defined and mon.get('enabled')) or (monitoring.cluster_stats.get('enabled') and monitoring.cluster_stats.ceph_user is defined) %}
{% raw %}
server:
alert:
- CephHealthCritical:
- if: >-
- ceph_overall_health == 3
- labels:
- severity: critical
- service: ceph
- annotations:
- summary: "Ceph health critical"
- description: "Ceph health is 'critical'. Run 'ceph -s' to get details."
- CephHealthWarning:
+ CephClusterHealthMinor:
if: >-
ceph_overall_health == 2
+ for: 3m
labels:
- severity: warning
+ severity: minor
service: ceph
annotations:
- summary: "Ceph health warning"
- description: "Ceph health is 'warning'. Run 'ceph -s' to get details."
- CephNumMonQuorumWarning:
+ summary: "Ceph cluster health is WARNING"
+ description: "The Ceph cluster is in the WARNING state. For details, run 'ceph -s'."
+ CephClusterHealthCritical:
if: >-
- ceph_num_mon > ceph_num_mon_quorum
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "Ceph Mon node down warning"
- description: "Ceph Mon node is down. Run 'ceph -s' to get details."
- CephNumOsdWarning:
- if: >-
- ceph_osdmap_num_osds > ceph_osdmap_num_up_osds
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "Ceph OSDs down warning"
- description: "Ceph OSD is down. Run 'ceph osd tree' to get details."
- CephUsedSpaceWarning:
- if: >-
- ceph_osd_bytes_used / ceph_osd_bytes > {%- endraw %} {{monitoring.space_used_warning_threshold|default('0.75')|float}} {%- raw %}
- labels:
- severity: warning
- service: ceph
- annotations:
- summary: "Ceph used space warning"
- description: "Ceph OSD free space utilization warning. Run 'ceph df' to get details."
- CephUsedSpaceCritical:
- if: >-
- ceph_osd_bytes_used / ceph_osd_bytes > {%- endraw %} {{monitoring.space_used_critical_threshold|default('0.85')|float}} {%- raw %}
+ ceph_overall_health == 3
+ for: 3m
labels:
severity: critical
service: ceph
annotations:
- summary: "Ceph used space critical"
- description: "Ceph OSD free space utilization critical. Run 'ceph df' to get details."
- CephApplyLatencyTooHigh:
+ summary: "Ceph cluster health is CRITICAL"
+ description: "The Ceph cluster is in the CRITICAL state. For details, run 'ceph -s'."
+ CephMonitorDownMinor:
if: >-
- avg(ceph_apply_latency_sum) / avg(ceph_apply_latency_avgcount) > {%- endraw %} {{monitoring.apply_latency_threshold|default('0.007')|float}} {%- raw %}
+ 100 * (1 - ceph_num_mon_quorum / ceph_num_mon) > 0
+ for: 3m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "Ceph Monitors are down"
+ description: "{{ $value }}% of Ceph Monitors are down. For details, run 'ceph -s'."
+ CephOsdDownMinor:
+ if: >-
+ 100 * (1 - ceph_osdmap_num_up_osds / ceph_osdmap_num_osds) > 0
+ for: 3m
+ labels:
+ severity: minor
+ service: ceph
+ annotations:
+ summary: "Ceph OSDs are down"
+ description: "{{ $value }}% of Ceph OSDs are down. For details, run 'ceph osd tree'."
+ CephOsdSpaceUsageWarning:
+ {%- endraw %}
+ {%- set threshold = monitoring.space_used_warning_threshold|default('0.75')|float %}
+ if: >-
+ ceph_osd_bytes_used > ceph_osd_bytes * {{threshold}}
+ {%- raw %}
+ for: 3m
labels:
severity: warning
service: ceph
annotations:
- summary: "Ceph apply latency too high"
- description: "Ceph apply latency too high."
- CephCommitLatencyTooHigh:
+ summary: "{%-endraw %}{{100*threshold}}{%- raw %}% of Ceph space is used"
+ description: "{{ $value }} bytes of Ceph OSD space (>= {% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
+ CephOsdSpaceUsageMajor:
+ {%- endraw %}
+ {%- set threshold = monitoring.space_used_critical_threshold|default('0.85')|float %}
if: >-
- avg(ceph_commit_latency_sum) / avg(ceph_commitcycle_latency_avgcount) > {%- endraw %} {{monitoring.commit_latency_threshold|default('0.7')|float}} {%- raw %}
+ ceph_osd_bytes_used > ceph_osd_bytes * {{threshold}}
+ {%- raw %}
+ for: 3m
+ labels:
+ severity: major
+ service: ceph
+ annotations:
+ summary: "{%-endraw %}{{100*threshold}}{%- raw %}% of Ceph space is used"
+ description: "{{ $value }} bytes of Ceph OSD space (>= {% endraw %}{{100*threshold}}{% raw %}%) is used for 3 minutes. For details, run 'ceph df'."
+ CephServiceApplyLatencyTooHigh:
+ {%- endraw %}
+ {%- set threshold = monitoring.apply_latency_threshold|default('0.007')|float %}
+ if: >-
+ avg(ceph_apply_latency_sum) / avg(ceph_apply_latency_avgcount) > {{threshold}}
+ {%- raw %}
+ for: 3m
labels:
severity: warning
service: ceph
annotations:
- summary: "Ceph commit latency too high"
- description: "Ceph commit latency too high."
+ summary: "Ceph apply latency reached the limit of {% endraw %}{{threshold}}{% raw %}s"
+ description: "The average Ceph apply latency is more than {% endraw %}{{threshold}}{% raw %} seconds for 3 minutes."
+ CephServiceCommitLatencyTooHigh:
+ {%- endraw %}
+ {%- set threshold = monitoring.commit_latency_threshold|default('0.7')|float %}
+ if: >-
+ avg(ceph_commit_latency_sum) / avg(ceph_commitcycle_latency_avgcount) > {{threshold}}
+ {%- raw %}
+ for: 3m
+ labels:
+ severity: warning
+ service: ceph
+ annotations:
+ summary: "Ceph commit latency reached the limit of {% endraw %}{{threshold}}{% raw %}s"
+ description: "The average Ceph commit latency is more than {% endraw %}{{threshold}}{% raw %} seconds for 3 minutes."
{% endraw %}
{%- if setup.pool is defined %}
{%- for pool_name, pool in setup.pool.iteritems() %}
- CephPoolUsedSpaceWarning{{pool_name|replace(".", "")|replace("-", "")}}:
-{% raw %}
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageWarning:
+ {%- set threshold = monitoring.pool_space_used_utilization_warning_threshold|default('0.75')|float %}
if: >-
- ceph_pool_usage_bytes_used{name="{% endraw %}{{pool_name}}{% raw %}"} / ceph_pool_usage_max_avail{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_space_used_utilization_warning_threshold|default('0.75')|float}} {% raw %}
+ ceph_pool_usage_bytes_used{name="{{pool_name}}"} / ceph_pool_usage_max_avail{name="{{pool_name}}"} > {{threshold}}
+ for: 3m
labels:
severity: warning
service: ceph
annotations:
- summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} space utilization warning"
- description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} free space utilization warning. Run 'ceph df' to get details."
-{% endraw %}
- CephPoolUsedSpaceCritical{{pool_name|replace(".", "")|replace("-", "")}}:
-{% raw %}
+ summary: "{{100*threshold}}% of Ceph pool space is used"
+ description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}SpaceUsageMinor:
+ {%- set threshold = monitoring.pool_space_used_critical_threshold|default('0.85')|float %}
if: >-
- ceph_pool_usage_bytes_used{name="{% endraw %}{{pool_name}}{% raw %}"} / ceph_pool_usage_max_avail{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_space_used_critical_threshold|default('0.85')|float}} {% raw %}
+ ceph_pool_usage_bytes_used{name="{{pool_name}}"} / ceph_pool_usage_max_avail{name="{{pool_name}}"} > {{threshold}}
+ for: 3m
labels:
- severity: critical
+ severity: minor
service: ceph
annotations:
- summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} space utilization critical"
- description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} free space utilization critical. Run 'ceph df' to get details."
-{% endraw %}
- CephPoolWriteOpsTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
-{% raw %}
+ summary: "{{100*threshold}}% of Ceph pool space is used"
+ description: "The Ceph {{pool_name}} pool uses {{100*threshold}}% of available space for 3 minutes. For details, run 'ceph df'."
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteOpsTooHigh:
+ {%- set threshold = monitoring.pool_write_ops_threshold|default('200')|float %}
if: >-
- ceph_pool_stats_write_op_per_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_write_ops_threshold|default('200')|float}} {% raw %}
+ ceph_pool_stats_write_op_per_sec{name="{{pool_name}}"} > {{threshold}}
+ for: 3m
labels:
severity: warning
service: ceph
annotations:
- summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write ops too high"
- description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write ops too high."
-{% endraw %}
- CephPoolWriteBytesTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
-{% raw %}
+ summary: "{{threshold}} Ceph pool write operations per second"
+ description: "The number of Ceph {{pool_name}} pool write operations per second is {{threshold}} for 3 minutes."
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}WriteBytesTooHigh:
+ {%- set threshold = monitoring.pool_write_bytes_threshold|default('70000000')|float %}
if: >-
- ceph_pool_stats_write_bytes_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_write_bytes_threshold|default('70000000')|float}} {% raw %}
+ ceph_pool_stats_write_bytes_sec{name="{{pool_name}}"} > {{threshold}}
+ for: 3m
labels:
severity: warning
service: ceph
annotations:
- summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write bytes too high"
- description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} write bytes too high."
-{% endraw %}
- CephPoolReadBytesTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
-{% raw %}
+ summary: "{{threshold}} Ceph pool write bytes per second"
+ description: "The number of Ceph {{pool_name}} pool write bytes per second is {{threshold}} for 3 minutes."
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadOpsTooHigh:
+ {%- set threshold = monitoring.pool_read_ops_threshold|default('1000')|float %}
if: >-
- ceph_pool_stats_read_bytes_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_read_bytes_threshold|default('70000000')|float}} {% raw %}
+ ceph_pool_stats_read_op_per_sec{name="{{pool_name}}"} > {{threshold}}
+ for: 3m
labels:
severity: warning
service: ceph
annotations:
- summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read bytes too high"
- description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read bytes too high."
-{% endraw %}
- CephPoolReadOpsTooHigh{{pool_name|replace(".", "")|replace("-", "")}}:
-{% raw %}
+ summary: "{{threshold}} Ceph pool read operations per second"
+ description: "The number of Ceph {{pool_name}} pool read operations per second is {{threshold}} for 3 minutes."
+ CephPool{{pool_name|replace(".", "")|replace("-", "")}}ReadBytesTooHigh:
+ {%- set threshold = monitoring.pool_read_bytes_threshold|default('70000000')|float %}
if: >-
- ceph_pool_stats_read_op_per_sec{name="{% endraw %}{{pool_name}}{% raw %}"} > {% endraw %} {{monitoring.pool_read_ops_threshold|default('1000')|float}} {% raw %}
+ ceph_pool_stats_read_bytes_sec{name="{{pool_name}}"} > {{threshold}}
+ for: 3m
labels:
severity: warning
service: ceph
annotations:
- summary: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read ops too high"
- description: "Ceph POOL {% endraw %}{{pool_name}}{% raw %} read ops too high."
-{% endraw %}
+ summary: "{{threshold}} Ceph pool read bytes per second"
+ description: "The number of Ceph {{pool_name}} pool read bytes per second is {{threshold}} for 3 minutes."
{%- endfor %}
{%- endif %}
{%- endif %}