Merge "Rework Nova service alerts"
diff --git a/nova/meta/prometheus.yml b/nova/meta/prometheus.yml
index 24cdac6..9029265 100644
--- a/nova/meta/prometheus.yml
+++ b/nova/meta/prometheus.yml
@@ -30,95 +30,103 @@
server:
alert:
{%- if is_controller %}
+{%- set minor_threshold = monitoring.services_failed_warning_threshold_percent|float %}
+{%- set major_threshold = monitoring.services_failed_critical_threshold_percent|float %}
+{%- set minor_compute_threshold = monitoring.computes_failed_warning_threshold_percent|float %}
+{%- set major_compute_threshold = monitoring.computes_failed_critical_threshold_percent|float %}
{% raw %}
+ NovaAPIOutage:  # matches only when the maximum check status over all nova.*/placement names is 0
+ if: >-
+ max(openstack_api_check_status{name=~"nova.*|placement"}) == 0
+ labels:  # no "for:" key is set on this rule, so no hold period is configured here
+ severity: critical
+ service: nova
+ annotations:
+ summary: "Nova API outage"
+ description: >-
+ Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog.
NovaAPIDown:
if: >-
- openstack_api_check_status{service=~"nova.*|placement"} == 0
- for: 2m
+ openstack_api_check_status{name=~"nova.*|placement"} == 0
labels:
- severity: down
- service: "{{ $labels.service }}"
+ severity: major  # per-endpoint rule: matches each selected series whose check status is 0
+ service: nova  # fixed value; the affected endpoint is reported through the "name" label in the annotations
annotations:
- summary: "Endpoint check for '{{ $labels.service }}' is down"
+ summary: "{{ $labels.name }} endpoint is not accessible"
description: >-
- Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes
+ Nova API is not accessible for the {{ $labels.name }} endpoint.
NovaAPIServiceDown:
if: >-
- http_response_status{service=~"nova-api"} == 0
+ http_response_status{name=~"nova-api"} == 0
for: 2m
labels:
- severity: down
- service: "{{ $labels.service }}"
+ severity: minor  # host-level HTTP check for nova-api; must stay 0 for the 2m "for:" window
+ service: nova  # fixed value; the node is reported through the "host" label in the description
annotations:
- summary: "HTTP check for '{{ $labels.service }}' down"
+ summary: "Host nova-api endpoint is not accessible"
description: >-
- The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
- NovaServicesWarning:
+ The host nova-api endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes.
+ NovaServiceDown:  # matches every openstack_nova_service_state series equal to 0 (one per binary/hostname pair, judging by the labels used below)
if: >-
- openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
- for: 2m
+ openstack_nova_service_state == 0
labels:
- severity: warning
- service: "{{ $labels.service }}"
+ severity: minor
+ service: nova  # fixed value; the failing service is reported through the "binary" label
annotations:
- summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
+ summary: "{{ $labels.binary }} service is down"
description: >-
- More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
- NovaServicesCritical:
+ The {{ $labels.binary }} service on the {{ $labels.hostname }} node is down.
+{%- endraw %}
+ NovaServicesDownMinor:  # per non-compute binary: down share >= minor_threshold and < major_threshold
if: >-
- openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
- for: 2m
+ count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{minor_threshold}} and count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) < on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{major_threshold}}
+ labels:
+ severity: minor
+ service: nova
+ annotations:  # percentages are rounded to hide float noise; the endraw tag has no "-" so the blank before "(at least" survives
+ summary: "{{ (minor_threshold * 100)|round(2) }}%{%- raw %} of {{ $labels.binary }} services are down"
+ description: >-
+ {{ $value }} {{ $labels.binary }} services are down{% endraw %} (at least {{ (minor_threshold * 100)|round(2) }}%).
+ NovaComputeServicesDownMinor:  # nova-compute only: down share >= minor_compute_threshold and < major_compute_threshold
+ if: >-
+ count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{minor_compute_threshold}} and count(openstack_nova_service_state{binary="nova-compute"} == 0) < count(openstack_nova_service_state{binary="nova-compute"}) * {{major_compute_threshold}}
+ labels:
+ severity: minor
+ service: nova
+ annotations:  # percentages are rounded to hide float noise; the endraw tag has no "-" so the blank before "(at least" survives
+ summary: "{{ (minor_compute_threshold * 100)|round(2) }}%{%- raw %} of nova-compute services are down"
+ description: >-
+ {{ $value }} nova-compute services are down{% endraw %} (at least {{ (minor_compute_threshold * 100)|round(2) }}%).
+ NovaServicesDownMajor:  # per non-compute binary: down share >= major_threshold
+ if: >-
+ count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{major_threshold}}
+ labels:
+ severity: major
+ service: nova
+ annotations:  # percentages are rounded to hide float noise; the endraw tag has no "-" so the blank before "(at least" survives
+ summary: "{{ (major_threshold * 100)|round(2) }}%{%- raw %} of {{ $labels.binary }} services are down"
+ description: >-
+ {{ $value }} {{ $labels.binary }} services are down{% endraw %} (at least {{ (major_threshold * 100)|round(2) }}%).
+ NovaComputeServicesDownMajor:  # nova-compute only: down share >= major_compute_threshold
+ if: >-
+ count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{major_compute_threshold}}
+ labels:
+ severity: major
+ service: nova
+ annotations:  # percentages are rounded to hide float noise; the endraw tag has no "-" so the blank before "(at least" survives
+ summary: "{{ (major_compute_threshold * 100)|round(2) }}%{%- raw %} of nova-compute services are down"
+ description: >-
+ {{ $value }} nova-compute services are down{% endraw %} (at least {{ (major_compute_threshold * 100)|round(2) }}%).{%- raw %}
+ NovaServiceOutage:  # per binary: the number of series with state 0 equals the total number of series for that binary
+ if: >-
+ count(openstack_nova_service_state == 0) by (binary) == on (binary) count(openstack_nova_service_state) by (binary)
labels:
severity: critical
- service: "{{ $labels.service }}"
+ service: nova  # fixed value; the affected service is reported through the "binary" label in the annotations
annotations:
- summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
+ summary: "{{ $labels.binary }} service outage"
description: >-
- More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
- NovaServicesDown:
- if: >-
- openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 0
- for: 2m
- labels:
- severity: down
- service: "{{ $labels.service }}"
- annotations:
- summary: "All {{ $labels.service }} services down"
- description: >-
- All '{{ $labels.service }}' services are down for the last 2 minutes
- NovaComputesWarning:
- if: >-
- openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent}} {%- raw %}
- for: 2m
- labels:
- severity: warning
- service: "{{ $labels.service }}"
- annotations:
- summary: "More than {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
- description: >-
- More than {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
- NovaComputesCritical:
- if: >-
- openstack_nova_services_percent{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent}} {%- raw %}
- for: 2m
- labels:
- severity: critical
- service: "{{ $labels.service }}"
- annotations:
- summary: "More than {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
- description: >-
- More than {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
- NovaComputesDown:
- if: >-
- openstack_nova_services{state="up",service=~"nova-compute"} == 0
- for: 2m
- labels:
- severity: down
- service: "{{ $labels.service }}"
- annotations:
- summary: "All {{ $labels.service }} services are down"
- description: >-
- All '{{ $labels.service }}' services are down for the last 2 minutes
+ All {{ $labels.binary }} services are down.
NovaTotalFreeVCPUsLow:
if: >-
(100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0
@@ -220,27 +228,23 @@
{%- raw %}
labels:
severity: warning
- service: "{{ $labels.service }}"
+ service: nova
annotations:
- summary: 'Too many errors in {{ $labels.service }} logs'
- description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
-
-{%- if is_compute %}
+ summary: "High number of errors in Nova logs"
+ description: "The rate of errors in Nova logs over the last 5 minutes is too high on the {{ $labels.host }} node (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }})."
+{%- if is_compute and exporters is defined %}
{%- raw %}
- NovaLibvirtDown:
+ LibvirtDown:  # matches each libvirt_up series equal to 0; the "for:" key below holds it for 2m
if: >-
- max(libvirt_up) by (host) == 0
+ libvirt_up == 0
for: 2m
labels:
- severity: down
- service: "libvirt"
+ severity: critical
+ service: libvirt
annotations:
- summary: "libvirt check on '{{ $labels.host }}' is down"
- description: >-
- libvirt check on '{{ $labels.host }}' is down for 2 minutes
+ summary: "Failure to gather Libvirt metrics"
+ description: "The Libvirt metric exporter fails to gather metrics on the {{ $labels.host }} node for at least 2 minutes."
{%- endraw %}
-{%- if exporters is defined %}
{%- include "prometheus/_exporters_config.sls" %}
{%- endif %}
{%- endif %}
-{%- endif %}