Add thresholds for nova services alerts

The thresholds are expressed as fractions (e.g. 0.3 means 30%) and can be configured through the following new variables:
 * services_failed_warning_threshold_percent
 * services_failed_critical_threshold_percent
 * computes_failed_warning_threshold_percent
 * computes_failed_critical_threshold_percent

Change-Id: I7cf5f00f4384776661dde5e07883449c37715ae2
Closes-Bug: PROD-15201
diff --git a/nova/map.jinja b/nova/map.jinja
index f725112..d8610f6 100644
--- a/nova/map.jinja
+++ b/nova/map.jinja
@@ -114,5 +114,9 @@
         'error_log_rate': {
               'warn': 0.2,
         },
+        'services_failed_warning_threshold_percent': 0.3,
+        'services_failed_critical_threshold_percent': 0.6,
+        'computes_failed_warning_threshold_percent': 0.25,
+        'computes_failed_critical_threshold_percent': 0.5,
     },
 }, grain='os_family', merge=salt['pillar.get']('nova:monitoring')) %}
diff --git a/nova/meta/prometheus.yml b/nova/meta/prometheus.yml
index c9d1a8d..8599418 100644
--- a/nova/meta/prometheus.yml
+++ b/nova/meta/prometheus.yml
@@ -41,7 +41,7 @@
       annotations:
         summary: "Endpoint check for '{{ $labels.service }}' is down"
         description: >-
-            Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+            Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes
     NovaAPIServiceDown:
       if: >-
         http_response_status{service=~"nova-api"} == 0
@@ -52,30 +52,30 @@
       annotations:
         summary: "HTTP check for '{{ $labels.service }}' down"
         description: >-
-            The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes.
-    NovaSomeServicesDown:
+            The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
+    NovaServicesWarning:
       if: >-
-        openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} > 0 and ignoring(state) openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= 2
+        openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
       for: 2m
       labels:
         severity: warning
         service: "{{ $labels.service }}"
       annotations:
-        summary: "Some {{ $labels.service }} services down"
+        summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
         description: >-
-            {{ $value }} '{{ $labels.service }}' service(s) is/are down for 2 minutes
-    NovaOnlyOneServiceUp:
+            More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
+    NovaServicesCritical:
       if: >-
-        openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 1 and ignoring(state) openstack_nova_services{state=~"down|disabled",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} > 0
+        openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
       for: 2m
       labels:
         severity: critical
         service: "{{ $labels.service }}"
       annotations:
-        summary: "Only one {{ $labels.service }} service up"
+        summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
         description: >-
-            Only one '{{ $labels.service }}' service is up for 2 minutes
-    NovaAllServicesDown:
+            More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
+    NovaServicesDown:
       if: >-
         openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 0
       for: 2m
@@ -85,30 +85,30 @@
       annotations:
         summary: "All {{ $labels.service }} services down"
         description: >-
-            All '{{ $labels.service }}' services are down for 2 minutes
-    NovaSomeComputesDown:
+            All '{{ $labels.service }}' services are down for the last 2 minutes
+    NovaComputesWarning:
       if: >-
-        openstack_nova_services{state="down",service=~"nova-compute"} > 0
+        openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent}} {%- raw %}
       for: 2m
       labels:
         severity: warning
         service: "{{ $labels.service }}"
       annotations:
-        summary: "Some {{ $labels.service }} services down"
+        summary: "More than {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
         description: >-
-            {{ $value }} '{{ $labels.service }}' service(s) is/are down for 2 minutes
-    NovaMajorityComputesDown:
+            More than {%- endraw %} {{monitoring.computes_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
+    NovaComputesCritical:
       if: >-
-        openstack_nova_services_percent{state="down",service=~"nova-compute"} > 50
+        openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent}} {%- raw %}
       for: 2m
       labels:
         severity: critical
         service: "{{ $labels.service }}"
       annotations:
-        summary: "Only one {{ $labels.service }} service up"
+        summary: "More than {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
         description: >-
-            Only one '{{ $labels.service }}' service is up for 2 minutes
-    NovaAllComputesDown:
+            More than {%- endraw %} {{monitoring.computes_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down for the last 2 minutes
+    NovaComputesDown:
       if: >-
         openstack_nova_services{state="up",service=~"nova-compute"} == 0
       for: 2m
@@ -116,9 +116,9 @@
         severity: down
         service: "{{ $labels.service }}"
       annotations:
-        summary: "All {{ $labels.service }} services down"
+        summary: "All {{ $labels.service }} services are down"
         description: >-
-            All '{{ $labels.service }}' services are down for 2 minutes
+            All '{{ $labels.service }}' services are down for the last 2 minutes
     NovaTotalFreeVCPUsLow:
       if: >-
         (100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0