Rework Cinder alerts
Related-PROD: PROD-19584
Change-Id: I27b24b49ceb195f29cb131bc5a314eabdbf3ba6f
diff --git a/cinder/meta/prometheus.yml b/cinder/meta/prometheus.yml
index 2b72ae4..4894e50 100644
--- a/cinder/meta/prometheus.yml
+++ b/cinder/meta/prometheus.yml
@@ -6,73 +6,86 @@
server:
alert:
{%- if is_controller %}
+{%- set minor_threshold = monitoring.services_failed_warning_threshold_percent|float %}
+{%- set major_threshold = monitoring.services_failed_critical_threshold_percent|float %}
{%- raw %}
- CinderAPIDown:
+ CinderAPIOutage:
if: >-
- max(openstack_api_check_status{service=~"cinder.*"}) == 0
- for: 2m
- labels:
- severity: down
- service: "{{ $labels.service }}"
- annotations:
- summary: "Endpoint check for '{{ $labels.service }}' is down"
- description: >-
- Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes
- CinderAPIServiceInfo:
- if: >-
- http_response_status{service=~"cinder-api"} == 0
- for: 2m
- labels:
- severity: info
- service: "{{ $labels.service }}"
- annotations:
- summary: "HTTP check for '{{ $labels.service }}' down"
- description: >-
- The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
- CinderServicesInfo:
- if: >-
- openstack_cinder_service == 1
- for: 2m
- labels:
- severity: info
- service: "{{ $labels.service }}"
- annotations:
- summary: "'{{ $labels.service }}' is down"
- description: >-
- '{{ $labels.service }}' is down on {{ $labels.hostname }} for the last 2 minutes.
- CinderServicesWarning:
- if: >-
- openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
- for: 2m
- labels:
- severity: warning
- service: "{{ $labels.service }}"
- annotations:
- summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
- description: >-
- {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})
- CinderServicesCritical:
- if: >-
- openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
- for: 2m
+ max(openstack_api_check_status{name=~"cinder.*"}) == 0
labels:
severity: critical
- service: "{{ $labels.service }}"
+ service: cinder
annotations:
- summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
+ summary: "Cinder API outage"
description: >-
- {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})
- CinderServicesDown:
+ The Cinder API check is down for every Cinder endpoint available in the OpenStack service catalog.
+ CinderAPIDown:
if: >-
- openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} == 0
+ openstack_api_check_status{name=~"cinder.*"} == 0
+ labels:
+ severity: major
+ service: cinder
+ annotations:
+ summary: "'{{ $labels.name }}' endpoint is down"
+ description: >-
+ Cinder API check for the '{{ $labels.name }}' endpoint is down.
+ CinderAPIServiceDown:
+ if: >-
+ http_response_status{name=~"cinder-api"} == 0
for: 2m
labels:
- severity: down
- service: "{{ $labels.service }}"
+ severity: minor
+ service: cinder
annotations:
- summary: "All {{ $labels.service }} services are down"
+ summary: "The HTTP check for the 'cinder-api' is down"
description: >-
- All {{ $labels.service }} services are down for the last 2 minutes
+ The HTTP check for the 'cinder-api' on the '{{ $labels.host }}' is down for 2 minutes.
+ CinderServiceDown:  # Matches each openstack_cinder_service_state series whose value is 0; no 'for' clause is set on this rule.
+ if: >-
+ openstack_cinder_service_state == 0
+ labels:  # Static labels: the service is always reported as "cinder", not taken from the metric.
+ severity: minor
+ service: cinder
+ annotations:  # Texts are built from the 'binary' and 'host' labels of the matching series.
+ summary: "The '{{ $labels.binary }}' is in down state"
+ description: >-
+ The '{{ $labels.binary }}' on the '{{ $labels.host }}' is in down state.
+{%- endraw %}
+ CinderServicesDownMinor:  # Per binary: the count of series at 0 is at or above total * minor_threshold and below total * major_threshold.
+ if: >-
+ count(openstack_cinder_service_state == 0) by (binary) >= on (binary) count(openstack_cinder_service_state) by (binary) * {{minor_threshold}} and count(openstack_cinder_service_state == 0) by (binary) < on (binary) count(openstack_cinder_service_state) by (binary) * {{major_threshold}}
+{%- raw %}
+ labels:  # Static labels; the expression above is rendered by Jinja, everything from here on is raw.
+ severity: minor
+ service: cinder
+ annotations:
+ summary: "Medium percentage of '{{ $labels.binary }}' services are in down state"
+ description: >-
+ {{ $value }} '{{ $labels.binary }}' services are in down state{%- endraw %} (More than {{minor_threshold * 100}}% and less than {{major_threshold * 100}}%).{%- raw %}
+{%- endraw %}
+ CinderServicesDownMajor:  # Per binary: the count of series at 0 is at or above total * major_threshold; there is no upper bound.
+ if: >-
+ count(openstack_cinder_service_state == 0) by (binary) >= on (binary) count(openstack_cinder_service_state) by (binary) * {{major_threshold}}
+{%- raw %}
+ labels:  # Static labels; the expression above is rendered by Jinja, everything from here on is raw.
+ severity: major
+ service: cinder
+ annotations:
+ summary: "High percentage of '{{ $labels.binary }}' services are in down state"
+ description: >-
+ {{ $value }} '{{ $labels.binary }}' services are in down state{%- endraw %} (More than {{major_threshold * 100}}%).{%- raw %}
+{%- endraw %}
+ CinderServiceOutage:  # Per binary: the count of series at 0 equals the total count of series for that binary.
+ if: >-
+ count(openstack_cinder_service_state == 0) by (binary) == on (binary) count(openstack_cinder_service_state) by (binary)
+{%- raw %}
+ labels:  # Static labels; this is the only service-state rule here with severity critical.
+ severity: critical
+ service: cinder
+ annotations:
+ summary: "'{{ $labels.binary }}' service outage"
+ description: >-
+ All '{{ $labels.binary }}' services are in down state.
{%- endraw %}
{%- endif %}
CinderErrorLogsTooHigh:
@@ -82,8 +95,8 @@
{%- raw %}
labels:
severity: warning
- service: "{{ $labels.service }}"
+ service: cinder
annotations:
- summary: 'Too many errors in {{ $labels.service }} logs'
- description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
+ summary: "There are too many errors in Cinder logs"
+ description: "The rate of errors in Cinder logs over the last 5 minutes is too high on node '{{ $labels.host }}' (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }})."
{%- endif %}