Add thresholds for nova services alerts
The thresholds are fractions in the 0-1 range (e.g. 0.3 means 30%) and can be configured through the following new variables:
* services_failed_warning_threshold_percent
* services_failed_critical_threshold_percent
* computes_failed_warning_threshold_percent
* computes_failed_critical_threshold_percent
Change-Id: I7cf5f00f4384776661dde5e07883449c37715ae2
Closes-Bug: PROD-15201
diff --git a/nova/map.jinja b/nova/map.jinja
index f725112..d8610f6 100644
--- a/nova/map.jinja
+++ b/nova/map.jinja
@@ -114,5 +114,9 @@
'error_log_rate': {
'warn': 0.2,
},
+ 'services_failed_warning_threshold_percent': 0.3,
+ 'services_failed_critical_threshold_percent': 0.6,
+ 'computes_failed_warning_threshold_percent': 0.25,
+ 'computes_failed_critical_threshold_percent': 0.5,
},
}, grain='os_family', merge=salt['pillar.get']('nova:monitoring')) %}
diff --git a/nova/meta/prometheus.yml b/nova/meta/prometheus.yml
index c9d1a8d..8599418 100644
--- a/nova/meta/prometheus.yml
+++ b/nova/meta/prometheus.yml
@@ -41,7 +41,7 @@
annotations:
summary: "Endpoint check for '{{ $labels.service }}' is down"
description: >-
- Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+ Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes
NovaAPIServiceDown:
if: >-
http_response_status{service=~"nova-api"} == 0
@@ -52,30 +52,30 @@
annotations:
summary: "HTTP check for '{{ $labels.service }}' down"
description: >-
- The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes.
- NovaSomeServicesDown:
+ The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
+ NovaServicesWarning:
if: >-
- openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} > 0 and ignoring(state) openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= 2
+ openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
for: 2m
labels:
severity: warning
service: "{{ $labels.service }}"
annotations:
- summary: "Some {{ $labels.service }} services down"
+ summary: "More than {%- endraw %} {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(2) }}%{%- raw %} of {{ $labels.service }} services are down"
description: >-
- {{ $value }} '{{ $labels.service }}' service(s) is/are down for 2 minutes
- NovaOnlyOneServiceUp:
+ More than {%- endraw %} {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(2) }}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
+ NovaServicesCritical:
if: >-
- openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 1 and ignoring(state) openstack_nova_services{state=~"down|disabled",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} > 0
+ openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
for: 2m
labels:
severity: critical
service: "{{ $labels.service }}"
annotations:
- summary: "Only one {{ $labels.service }} service up"
+ summary: "More than {%- endraw %} {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(2) }}%{%- raw %} of {{ $labels.service }} services are down"
description: >-
- Only one '{{ $labels.service }}' service is up for 2 minutes
- NovaAllServicesDown:
+ More than {%- endraw %} {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(2) }}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
+ NovaServicesDown:
if: >-
openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 0
for: 2m
@@ -85,30 +85,30 @@
annotations:
summary: "All {{ $labels.service }} services down"
description: >-
- All '{{ $labels.service }}' services are down for 2 minutes
- NovaSomeComputesDown:
+ All '{{ $labels.service }}' services are down for the last 2 minutes
+ NovaComputesWarning:
if: >-
- openstack_nova_services{state="down",service=~"nova-compute"} > 0
+ openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent}} {%- raw %}
for: 2m
labels:
severity: warning
service: "{{ $labels.service }}"
annotations:
- summary: "Some {{ $labels.service }} services down"
+ summary: "More than {%- endraw %} {{ (monitoring.computes_failed_warning_threshold_percent * 100) | round(2) }}%{%- raw %} of {{ $labels.service }} services are down"
description: >-
- {{ $value }} '{{ $labels.service }}' service(s) is/are down for 2 minutes
- NovaMajorityComputesDown:
+ More than {%- endraw %} {{ (monitoring.computes_failed_warning_threshold_percent * 100) | round(2) }}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
+ NovaComputesCritical:
if: >-
- openstack_nova_services_percent{state="down",service=~"nova-compute"} > 50
+ openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent}} {%- raw %}
for: 2m
labels:
severity: critical
service: "{{ $labels.service }}"
annotations:
- summary: "Only one {{ $labels.service }} service up"
+ summary: "More than {%- endraw %} {{ (monitoring.computes_failed_critical_threshold_percent * 100) | round(2) }}%{%- raw %} of {{ $labels.service }} services are down"
description: >-
- Only one '{{ $labels.service }}' service is up for 2 minutes
- NovaAllComputesDown:
+ More than {%- endraw %} {{ (monitoring.computes_failed_critical_threshold_percent * 100) | round(2) }}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
+ NovaComputesDown:
if: >-
openstack_nova_services{state="up",service=~"nova-compute"} == 0
for: 2m
@@ -116,9 +116,9 @@
severity: down
service: "{{ $labels.service }}"
annotations:
- summary: "All {{ $labels.service }} services down"
+ summary: "All {{ $labels.service }} services are down"
description: >-
- All '{{ $labels.service }}' services are down for 2 minutes
+ All '{{ $labels.service }}' services are down for the last 2 minutes
NovaTotalFreeVCPUsLow:
if: >-
(100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0