Add configurable failure thresholds to cinder services alerts
Change-Id: I01b56bb678f967953eff2ef65c655563e2c95bfb
Closes-Bug: PROD-15128
diff --git a/cinder/map.jinja b/cinder/map.jinja
index 134ac55..b8806e6 100644
--- a/cinder/map.jinja
+++ b/cinder/map.jinja
@@ -89,5 +89,7 @@
{% set monitoring = salt['grains.filter_by']({
'default': {
'error_log_rate': 0.2,
+ 'services_failed_warning_threshold_percent': 0.3,
+ 'services_failed_critical_threshold_percent': 0.6,
},
}, grain='os_family', merge=salt['pillar.get']('cinder:monitoring')) %}
diff --git a/cinder/meta/prometheus.yml b/cinder/meta/prometheus.yml
index d4d3780..dca35fb 100644
--- a/cinder/meta/prometheus.yml
+++ b/cinder/meta/prometheus.yml
@@ -17,41 +17,52 @@
annotations:
summary: "Endpoint check for '{{ $labels.service }}' is down"
description: >-
- Endpoint check for '{{ $labels.service }}' is down for 2 minutes
- CinderAPIServiceDown:
+ Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes
+ CinderAPIServiceInfo:
if: >-
http_response_status{service=~"cinder-api"} == 0
for: 2m
labels:
- severity: down
+ severity: info
service: "{{ $labels.service }}"
annotations:
summary: "HTTP check for '{{ $labels.service }}' down"
description: >-
- The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes.
- CinderSomeServicesDown:
+ The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
+ CinderServicesInfo:
if: >-
- openstack_cinder_services{state="down",service=~"cinder-volume|cinder-scheduler"} > 0 and ignoring (state) openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} >= 2
+ openstack_cinder_service == 1
+ for: 2m
+ labels:
+ severity: info
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "'{{ $labels.service }}' is down"
+ description: >-
+ '{{ $labels.service }}' is down on {{ $labels.hostname }} for the last 2 minutes.
+ CinderServicesWarning:
+ if: >-
+ openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
for: 2m
labels:
severity: warning
service: "{{ $labels.service }}"
annotations:
- summary: "Some {{ $labels.service }} services are down"
+ summary: "At least {%- endraw %} {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(1) }}%{%- raw %} of {{ $labels.service }} services are down"
description: >-
- {{ $value }} {{ $labels.service }} services are down for 2 minutes
- CinderOnlyOneServiceUp:
+ {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (at least {%- endraw %} {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(1) }}%{%- raw %})
+ CinderServicesCritical:
if: >-
- openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} == 1 and ignoring (state) openstack_cinder_services{state="down",service=~"cinder-volume|cinder-scheduler"} > 0
+ openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
for: 2m
labels:
severity: critical
service: "{{ $labels.service }}"
annotations:
- summary: "Only one {{ $labels.service }} service is up"
+ summary: "At least {%- endraw %} {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(1) }}%{%- raw %} of {{ $labels.service }} services are down"
description: >-
- Only one {{ $labels.service }} service is up for 2 minutes
- CinderAllServicesDown:
+ {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (at least {%- endraw %} {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(1) }}%{%- raw %})
+ CinderServicesDown:
if: >-
openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} == 0
for: 2m
@@ -61,7 +72,7 @@
annotations:
summary: "All {{ $labels.service }} services are down"
description: >-
- All {{ $labels.service }} services are down for 2 minutes
+ All {{ $labels.service }} services are down for the last 2 minutes
{%- endraw %}
{%- endif %}
CinderErrorLogsTooHigh: