Fix Cinder alerts: add cinder-api endpoint alerts, reword log alert
Change-Id: Ia71d7a69bfac7712ec0de7b3096a5c61bc1fa253
Related-PROD: PROD-19584
diff --git a/cinder/map.jinja b/cinder/map.jinja
index 35af31f..c493410 100644
--- a/cinder/map.jinja
+++ b/cinder/map.jinja
@@ -126,5 +126,6 @@
'error_log_rate': 0.2,
'services_failed_warning_threshold_percent': 0.3,
'services_failed_critical_threshold_percent': 0.6,
+ 'endpoint_failed_major_threshold': 0.5,
},
}, grain='os_family', merge=salt['pillar.get']('cinder:monitoring')) %}
diff --git a/cinder/meta/prometheus.yml b/cinder/meta/prometheus.yml
index 27833b3..c3c911e 100644
--- a/cinder/meta/prometheus.yml
+++ b/cinder/meta/prometheus.yml
@@ -8,6 +8,7 @@
{%- if is_controller %}
{%- set minor_threshold = monitoring.services_failed_warning_threshold_percent|float %}
{%- set major_threshold = monitoring.services_failed_critical_threshold_percent|float %}
+{%- set major_endpoint_threshold = monitoring.endpoint_failed_major_threshold|float %}
{%- raw %}
CinderAPIOutage:
if: >-
@@ -40,6 +41,29 @@
summary: "Host cinder-api endpoint is not accessible"
description: >-
The host cinder-api endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes.
+{%- endraw %}
+ CinderAPIServiceDownMajor:  # Major: share of inaccessible cinder-api endpoints reached major_endpoint_threshold
+ if: >-
+ count(http_response_status{name=~"cinder-api"} == 0) >= count(http_response_status{name=~"cinder-api"}) * {{ major_endpoint_threshold }}
+ for: 2m  # condition must hold for 2 minutes before the alert fires
+ labels:
+ severity: major
+ service: cinder
+ annotations:  # percentage is rendered with %g so 0.5 prints as 50 (not 50.0) and float noise is dropped
+ summary: "{{ '%g'|format(major_endpoint_threshold * 100) }}% of host cinder-api endpoints are not accessible"
+ description: >-
+ {% raw %}{{ $value }} host cinder-api endpoints are not accessible for at least 2 minutes (at least {% endraw %}{{ '%g'|format(major_endpoint_threshold * 100) }}{% raw %}%).
+ CinderAPIServiceOutage:  # Critical: every reporting cinder-api endpoint is inaccessible
+ if: >-
+ count(http_response_status{name=~"cinder-api"} == 0) == count(http_response_status{name=~"cinder-api"})
+ for: 2m  # condition must hold for 2 minutes before the alert fires
+ labels:
+ severity: critical
+ service: cinder
+ annotations:
+ summary: "Host cinder-api outage"
+ description: >-
+ All available host cinder-api endpoints are not accessible for at least 2 minutes.
CinderServiceDown:
if: >-
openstack_cinder_service_state == 0
@@ -93,5 +117,6 @@
service: cinder
annotations:
summary: "High number of errors in Cinder logs"
- description: "The rate of errors in Cinder logs over the last 5 minutes is too high on the {{ $labels.host }} node (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }})."
+ description: "The average per-second rate of errors in Cinder logs on the {{ $labels.host }} node is {{ $value }} (as measured over the last 5 minutes)."
+{%- endraw %}
{%- endif %}