Merge "Rework Nova service alerts"
diff --git a/nova/meta/prometheus.yml b/nova/meta/prometheus.yml
index 24cdac6..9029265 100644
--- a/nova/meta/prometheus.yml
+++ b/nova/meta/prometheus.yml
@@ -30,95 +30,103 @@
 server:
   alert:
 {%- if is_controller %}
+{%- set minor_threshold = monitoring.services_failed_warning_threshold_percent|float %}
+{%- set major_threshold = monitoring.services_failed_critical_threshold_percent|float %}
+{%- set minor_compute_threshold = monitoring.computes_failed_warning_threshold_percent|float %}
+{%- set major_compute_threshold = monitoring.computes_failed_critical_threshold_percent|float %}
 {% raw %}
+    NovaAPIOutage:
+      if: >-
+        max(openstack_api_check_status{name=~"nova.*|placement"}) == 0
+      labels:
+        severity: critical
+        service: nova
+      annotations:
+        summary: "Nova API outage"
+        description: >-
+          Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog.
     NovaAPIDown:
       if: >-
-        openstack_api_check_status{service=~"nova.*|placement"} == 0
-      for: 2m
+        openstack_api_check_status{name=~"nova.*|placement"} == 0
       labels:
-        severity: down
-        service: "{{ $labels.service }}"
+        severity: major
+        service: nova
       annotations:
-        summary: "Endpoint check for '{{ $labels.service }}' is down"
+        summary: "{{ $labels.name }} endpoint is not accessible"
         description: >-
-            Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes
+          Nova API is not accessible for the {{ $labels.name }} endpoint.
     NovaAPIServiceDown:
       if: >-
-        http_response_status{service=~"nova-api"} == 0
+        http_response_status{name=~"nova-api"} == 0
       for: 2m
       labels:
-        severity: down
-        service: "{{ $labels.service }}"
+        severity: minor
+        service: nova
       annotations:
-        summary: "HTTP check for '{{ $labels.service }}' down"
+        summary: "Host nova-api endpoint is not accessible"
         description: >-
-            The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
-    NovaServicesWarning:
+          The host nova-api endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes.
+    NovaServiceDown:
       if: >-
-        openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
-      for: 2m
+        openstack_nova_service_state == 0
       labels:
-        severity: warning
-        service: "{{ $labels.service }}"
+        severity: minor
+        service: nova
       annotations:
-        summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
+        summary: "{{ $labels.binary }} service is down"
         description: >-
-            More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
-    NovaServicesCritical:
+          The {{ $labels.binary }} service on the {{ $labels.hostname }} node is down.
+{%- endraw %}
+    NovaServicesDownMinor:  # per-binary share of down non-compute services is >= minor and < major threshold
       if: >-
-        openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
-      for: 2m
+        count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{minor_threshold}} and count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) < on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{major_threshold}}
+      labels:
+        severity: minor
+        service: nova
+      annotations:
+        summary: "{{minor_threshold * 100}}%{%- raw %} of {{ $labels.binary }} services are down"
+        description: >-
+          {{ $value }} {{ $labels.binary }} services are down{%- endraw %} (at least {{minor_threshold * 100}}%).
+    NovaComputeServicesDownMinor:  # share of down nova-compute services is >= minor and < major compute threshold
+      if: >-
+        count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{minor_compute_threshold}} and count(openstack_nova_service_state{binary="nova-compute"} == 0) < count(openstack_nova_service_state{binary="nova-compute"}) * {{major_compute_threshold}}
+      labels:
+        severity: minor
+        service: nova
+      annotations:
+        summary: "{{minor_compute_threshold * 100}}%{%- raw %} of nova-compute services are down"
+        description: >-
+          {{ $value }} nova-compute services are down{%- endraw %} (at least {{minor_compute_threshold * 100}}%).
+    NovaServicesDownMajor:  # per-binary share of down non-compute services is >= major threshold
+      if: >-
+        count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{major_threshold}}
+      labels:
+        severity: major
+        service: nova
+      annotations:
+        summary: "{{major_threshold * 100}}%{%- raw %} of {{ $labels.binary }} services are down"
+        description: >-
+          {{ $value }} {{ $labels.binary }} services are down{%- endraw %} (at least {{major_threshold * 100}}%).
+    NovaComputeServicesDownMajor:  # share of down nova-compute services is >= major compute threshold
+      if: >-
+        count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{major_compute_threshold}}
+      labels:
+        severity: major
+        service: nova
+      annotations:
+        summary: "{{major_compute_threshold * 100}}%{%- raw %} of nova-compute services are down"
+        description: >-
+          {{ $value }} nova-compute services are down{%- endraw %} (at least {{major_compute_threshold * 100}}%).{%- raw %}
+    NovaServiceOutage:
+      if: >-
+        count(openstack_nova_service_state == 0) by (binary) == on (binary) count(openstack_nova_service_state) by (binary)
       labels:
         severity: critical
-        service: "{{ $labels.service }}"
+        service: nova
       annotations:
-        summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
+        summary: "{{ $labels.binary }} service outage"
         description: >-
-            More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
-    NovaServicesDown:
-      if: >-
-        openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 0
-      for: 2m
-      labels:
-        severity: down
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "All {{ $labels.service }} services down"
-        description: >-
-            All '{{ $labels.service }}' services are down for the last 2 minutes
-    NovaComputesWarning:
-      if: >-
-        openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent}} {%- raw %}
-      for: 2m
-      labels:
-        severity: warning
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "More than {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
-        description: >-
-            More than {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
-    NovaComputesCritical:
-      if: >-
-        openstack_nova_services_percent{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent}} {%- raw %}
-      for: 2m
-      labels:
-        severity: critical
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "More than {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
-        description: >-
-            More than {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
-    NovaComputesDown:
-      if: >-
-        openstack_nova_services{state="up",service=~"nova-compute"} == 0
-      for: 2m
-      labels:
-        severity: down
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "All {{ $labels.service }} services are down"
-        description: >-
-            All '{{ $labels.service }}' services are down for the last 2 minutes
+          All {{ $labels.binary }} services are down.
     NovaTotalFreeVCPUsLow:
       if: >-
         (100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0
@@ -220,27 +228,23 @@
 {%- raw %}
       labels:
         severity: warning
-        service: "{{ $labels.service }}"
+        service: nova
       annotations:
-        summary: 'Too many errors in {{ $labels.service }} logs'
-        description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
-
-{%- if is_compute %}
+        summary: "High number of errors in Nova logs"
+        description: "The rate of errors in Nova logs over the last 5 minutes is too high on the {{ $labels.host }} node (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }})."
+{%- if is_compute and exporters is defined %}
 {%- raw %}
-    NovaLibvirtDown:
+    LibvirtDown:
       if: >-
-        max(libvirt_up) by (host) == 0
+        libvirt_up == 0
       for: 2m
       labels:
-        severity: down
-        service: "libvirt"
+        severity: critical
+        service: libvirt
       annotations:
-        summary: "libvirt check on '{{ $labels.host }}' is down"
-        description: >-
-            libvirt check on '{{ $labels.host }}' is down for 2 minutes
+        summary: "Failure to gather Libvirt metrics"
+        description: "The Libvirt metric exporter fails to gather metrics on the {{ $labels.host }} node for at least 2 minutes."
 {%- endraw %}
-{%- if exporters is defined %}
 {%- include "prometheus/_exporters_config.sls" %}
 {%- endif %}
 {%- endif %}
-{%- endif %}