Rework Heat alerts

Change-Id: I5b7a7ee1caf5e094de024648dcf6050648485c3c
Related-PROD: PROD-19917

commit: 93d6d0901254314a7b6c59e83707a6b5afab6a86 [log] [tgz]
author: Ildar Svetlov <isvetlov@mirantis.com> Mon May 14 16:09:47 2018 +0400
committer: Ildar Svetlov <isvetlov@mirantis.com> Thu May 17 12:01:38 2018 +0400
tree: 157aa1f3f5bd94ce611c9b2ab87faab880e8b638
parent: 9a85022c19b8018f40e392a5ff22e2442a3bc020 [diff]
diff --git a/heat/map.jinja b/heat/map.jinja
index 4653825..17b4dd8 100644
--- a/heat/map.jinja
+++ b/heat/map.jinja

@@ -51,7 +51,6 @@
 {% set monitoring = salt['grains.filter_by']({
     'default': {
         'error_log_rate': 0.2,
-        'services_failed_warning_threshold_percent': 0.3,
-        'services_failed_critical_threshold_percent': 0.6,
+        'endpoint_failed_major_threshold': 0.5,
     },
 }, grain='os_family', merge=salt['pillar.get']('heat:monitoring')) %}

diff --git a/heat/meta/prometheus.yml b/heat/meta/prometheus.yml
index d518075..d70d223 100644
--- a/heat/meta/prometheus.yml
+++ b/heat/meta/prometheus.yml

@@ -2,75 +2,76 @@
 
 {%- from "heat/map.jinja" import server, monitoring with context %}
 {%- if server.get('enabled', False) %}
+{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
 {% raw %}
 server:
   alert:
     HeatAPIDown:
       if: >-
-        openstack_api_check_status{service=~"heat.*"} == 0
+        openstack_api_check_status{name=~"heat.*"} == 0
+      labels:
+        severity: major
+        service: heat
+      annotations:
+        summary: "{{ $labels.name }} endpoint is not accessible"
+        description: >-
+          Heat API is not accessible for the {{ $labels.name }} endpoint.
+    HeatAPIOutage:
+      if: >-
+        max(openstack_api_check_status{name=~"heat.*"}) == 0
+      labels:
+        severity: critical
+        service: heat
+      annotations:
+        summary: "Heat API outage"
+        description: >-
+          Heat API is not accessible for all available Heat endpoints in the OpenStack service catalog.
+    HeatAPIServiceDown:
+      if: >-
+        http_response_status{name=~"heat.*-api"} == 0
       for: 2m
       labels:
-        severity: down
-        service: "{{ $labels.service }}"
+        severity: minor
+        service: heat
       annotations:
-        summary: "Endpoint check for '{{ $labels.service }}' is down"
+        summary: "Host {{ $labels.name }} endpoint is not accessible"
         description: >-
-            Endpoint check for '{{ $labels.service }}' is down for 2 minutes
-    HeatAPIServicesInfo:
+          The host {{ $labels.name }} endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes.
+{%- endraw %}
+    HeatAPIServiceDownMajor:
       if: >-
-        http_response_status{service=~"heat.*-api"} == 0
+        count(http_response_status{name=~"heat.*-api"} == 0) by (name) >= count(http_response_status{name=~"heat.*-api"}) by (name) * {{ major_threshold }}
       for: 2m
       labels:
-        severity: info
-        service: "{{ $labels.service }}"
+        severity: major
+        service: heat
       annotations:
-        summary: "HTTP check for '{{ $labels.service }}' down"
+        summary: "{{major_threshold * 100}}% of host {% raw %}{{ $labels.name }} endpoints are not accessible"
         description: >-
-            The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
-    HeatAPIServicesWarning:
+          {{ $value }} host {{ $labels.name }} endpoints are not accessible for at least 2 minutes (at least {% endraw %}{{major_threshold * 100}}{% raw %}%).
+    HeatAPIServiceOutage:
       if: >-
-          count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
-      for: 2m
-      labels:
-        severity: warning
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
-        description: >-
-            {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})
-    HeatAPIServicesCritical:
-      if: >-
-          count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
+        count(http_response_status{name=~"heat.*-api"} == 0) by (name) == count(http_response_status{name=~"heat.*-api"}) by (name)
       for: 2m
       labels:
         severity: critical
-        service: "{{ $labels.service }}"
+        service: heat
       annotations:
-        summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
+        summary: "Host {{ $labels.name }} outage"
         description: >-
-            {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})
-    HeatAPIServicesDown:
-      if: >-
-        count(http_response_status{service=~"heat.*-api"} == 0) by (service) == on (service) count(http_response_status{service=~"heat.*-api"}) by (service)
-      for: 2m
-      labels:
-        severity: down
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "All {{ $labels.service }} services are down"
-        description: >-
-            All {{ $labels.service }} services are down for the last 2 minutes
-    HeatErrorLogsTooHigh:
+          All available host {{ $labels.name }} endpoints are not accessible for at least 2 minutes.
 {%- endraw %}
+    HeatErrorLogsTooHigh:
       {%- set log_threshold = monitoring.error_log_rate|float %}
       if: >-
         sum(rate(log_messages{service="heat",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > {{ log_threshold }}
 {%- raw %}
       labels:
         severity: warning
-        service: "{{ $labels.service }}"
+        service: heat
       annotations:
-        summary: 'Too many errors in {{ $labels.service }} logs'
-        description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
+        summary: "High number of errors in Heat logs"
+        description: "The average per-second rate of errors in Heat logs on the {{ $labels.host }} node is {{ $value }} (as measured over the last 5 minutes)."
+{%- endraw %}
 {%- endif %}
 {%- endif %}
commit	93d6d0901254314a7b6c59e83707a6b5afab6a86	[log] [tgz]
author	Ildar Svetlov <isvetlov@mirantis.com>	Mon May 14 16:09:47 2018 +0400
committer	Ildar Svetlov <isvetlov@mirantis.com>	Thu May 17 12:01:38 2018 +0400
tree	157aa1f3f5bd94ce611c9b2ab87faab880e8b638
parent	9a85022c19b8018f40e392a5ff22e2442a3bc020 [diff]