Rework Keystone alerts

Change-Id: Ifb0af8382817d6bee452b59800a343c73596d1ed
diff --git a/keystone/map.jinja b/keystone/map.jinja
index 6fdcdde..d65b795 100644
--- a/keystone/map.jinja
+++ b/keystone/map.jinja
@@ -93,6 +93,7 @@
             'percentage': 50,
             'all_auths_rate': 0.1,
         },
+        'endpoint_failed_major_threshold': 0.5,
         'telegraf_openstack_input_plugin_interval': '3m',
         'telegraf_openstack_output_plugin_expiration_interval': '6m',
         'output_openstack_port': 9127,
diff --git a/keystone/meta/prometheus.yml b/keystone/meta/prometheus.yml
index 078f98b..4d2c915 100644
--- a/keystone/meta/prometheus.yml
+++ b/keystone/meta/prometheus.yml
@@ -10,30 +10,53 @@
           - 'tasks.monitoring_remote_agent'
           type: A
           port: '{{ monitoring.output_openstack_port }}'
+{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}{# Share (0..1) of failed endpoints used by KeystoneAPIServicesDownMajor; cast to float because it is multiplied by a series count and by 100. #}
 {% raw %}
   alert:
-    KeystoneAPIDown:
+    KeystoneAPIOutage:  # matches openstack_api_check_status series reporting 0; no 'for' hold time is set on this alert
       if: >-
-        openstack_api_check_status{service=~"keystone.*"} == 0
-      for: 2m
+        openstack_api_check_status{name=~"keystone.*"} == 0
       labels:
-        severity: down
-        service: "{{ $labels.service }}"
+        severity: critical
+        service: keystone  # static value; not taken from the series labels
       annotations:
-        summary: "Endpoint check for '{{ $labels.service }}' is down"
+        summary: "Keystone API outage"
         description: >-
-            Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+          Keystone API is not accessible for the Keystone endpoint in the OpenStack service catalog.
     KeystoneAPIServiceDown:
       if: >-
-        http_response_status{service=~"keystone.*"} == 0
+        http_response_status{name=~"keystone.*"} == 0
       for: 2m
       labels:
-        severity: down
-        service: "{{ $labels.service }}"
+        severity: minor  # per-series alert; the by-name aggregated alerts that follow use major and critical
+        service: keystone  # static value; not taken from the series labels
       annotations:
-        summary: "HTTP check for '{{ $labels.service }}' down"
+        summary: "Host {{ $labels.name }} endpoint is not accessible"
         description: >-
-            The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes.
+          The host {{ $labels.name }} endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes.
+{%- endraw %}{# Raw mode ends here because the next alert renders major_threshold through Jinja. #}
+    KeystoneAPIServicesDownMajor:  # per name: count of series reporting 0 reaches major_threshold times the count of all series
+      if: >-
+        count(http_response_status{name=~"keystone.*"} == 0) by (name) >= count(http_response_status{name=~"keystone.*"}) by (name) * {{ major_threshold }}
+      for: 2m
+      labels:
+        severity: major
+        service: keystone
+      annotations:
+        summary: "{{ (major_threshold * 100)|round(2) }}% of host {% raw %}{{ $labels.name }} endpoints are not accessible"  # percentage is rounded so float artifacts such as 55.00000000000001 never reach the text
+        description: >-
+          {{ $value }} host {{ $labels.name }} endpoints are not accessible for at least 2 minutes (at least {% endraw %}{{ (major_threshold * 100)|round(2) }}{% raw %}%).
+    KeystoneAPIServiceOutage:  # per name: count of series reporting 0 equals the count of all series
+      if: >-
+        count(http_response_status{name=~"keystone.*"} == 0) by (name) == count(http_response_status{name=~"keystone.*"}) by (name)
+      for: 2m  # same duration that the description text quotes
+      labels:
+        severity: critical
+        service: keystone
+      annotations:
+        summary: "Host {{ $labels.name }} outage"
+        description: >-
+          All available host {{ $labels.name }} endpoints are not accessible for at least 2 minutes.
     KeystoneErrorLogsTooHigh:
 {%- endraw %}
       {%- set log_threshold = monitoring.error_log_rate|float %}
@@ -42,11 +65,12 @@
 {%- raw %}
       labels:
         severity: warning
-        service: "{{ $labels.service }}"
+        service: keystone  # static value; not taken from the series labels
       annotations:
-        summary: 'Too many errors in {{ $labels.service }} logs'
-        description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
-    KeystoneAPITooSlow:
+        summary: "High number of errors in Keystone logs"
+        description: "The average per-second rate of errors in Keystone logs on the {{ $labels.host }} node is {{ $value }} (as measured over the last 5 minutes)."
+{%- endraw %}{# Raw mode ends here because the next alert renders response_time_threshold through Jinja. #}
+    KeystoneAPIResponseTimeTooHigh:  # compares the per-host maximum of the 0.9-quantile GET/POST 2xx series with response_time_threshold
       {%- set response_time_threshold = monitoring.http_response_time_p90|float %}
       if: >-
         max by(host) (openstack_http_response_times{service='keystone',quantile="0.9",http_method=~"^(GET|POST)$",http_status=~"^2..$"}) >= {{ response_time_threshold }}
@@ -56,7 +80,6 @@
         severity: warning
         service: keystone
       annotations:
-        summary: 'Keystone API too slow'
-        description: 'The 90th percentile of the Keystone API response times for GET and POST requests is too high on node {{ $labels.host }} (current value={{ $value }}s, threshold={%- endraw %}{{ response_time_threshold }}s).'
-
+        summary: "High response time of Keystone API"
+        description: "The Keystone API response time for GET and POST requests on the {{ $labels.host }} node is higher than {% endraw %}{{response_time_threshold}}s for at least 2 minutes."  # raw mode is closed mid-line so that Jinja renders response_time_threshold
 {%- endif %}