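Rework Heat alerts

Select the Heat API check metrics by the "name" label instead of
"service", set a static "service: heat" label on every alert, and
rewrite the alert summaries and descriptions.

Alerts defined after this change:
  * HeatAPIDown (major) and HeatAPIOutage (critical), based on
    openstack_api_check_status
  * HeatAPIServiceDown (minor), HeatAPIServiceDownMajor (major) and
    HeatAPIServiceOutage (critical), based on http_response_status
  * HeatErrorLogsTooHigh (warning), based on log_messages

HeatAPIServicesInfo, HeatAPIServicesWarning, HeatAPIServicesCritical
and HeatAPIServicesDown are removed.

The services_failed_warning_threshold_percent and
services_failed_critical_threshold_percent defaults are dropped from
heat/map.jinja; heat/meta/prometheus.yml now reads a single
endpoint_failed_major_threshold option (default 0.5) instead.

Change-Id: I5b7a7ee1caf5e094de024648dcf6050648485c3c
Related-PROD: PROD-19917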
diff --git a/heat/map.jinja b/heat/map.jinja
index 4653825..17b4dd8 100644
--- a/heat/map.jinja
+++ b/heat/map.jinja
@@ -51,7 +51,6 @@
{% set monitoring = salt['grains.filter_by']({
'default': {
'error_log_rate': 0.2,
- 'services_failed_warning_threshold_percent': 0.3,
- 'services_failed_critical_threshold_percent': 0.6,
+ 'endpoint_failed_major_threshold': 0.5,
},
}, grain='os_family', merge=salt['pillar.get']('heat:monitoring')) %}
diff --git a/heat/meta/prometheus.yml b/heat/meta/prometheus.yml
index d518075..d70d223 100644
--- a/heat/meta/prometheus.yml
+++ b/heat/meta/prometheus.yml
@@ -2,75 +2,76 @@
{%- from "heat/map.jinja" import server, monitoring with context %}
{%- if server.get('enabled', False) %}
+{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
{% raw %}
server:
alert:
HeatAPIDown:
if: >-
- openstack_api_check_status{service=~"heat.*"} == 0
+ openstack_api_check_status{name=~"heat.*"} == 0
+ labels:
+ severity: major
+ service: heat
+ annotations:
+ summary: "{{ $labels.name }} endpoint is not accessible"
+ description: >-
+ Heat API is not accessible for the {{ $labels.name }} endpoint.
+ HeatAPIOutage:
+ if: >-
+ max(openstack_api_check_status{name=~"heat.*"}) == 0
+ labels:
+ severity: critical
+ service: heat
+ annotations:
+ summary: "Heat API outage"
+ description: >-
+ Heat API is not accessible for all available Heat endpoints in the OpenStack service catalog.
+ HeatAPIServiceDown:
+ if: >-
+ http_response_status{name=~"heat.*-api"} == 0
for: 2m
labels:
- severity: down
- service: "{{ $labels.service }}"
+ severity: minor
+ service: heat
annotations:
- summary: "Endpoint check for '{{ $labels.service }}' is down"
+ summary: "Host {{ $labels.name }} endpoint is not accessible"
description: >-
- Endpoint check for '{{ $labels.service }}' is down for 2 minutes
- HeatAPIServicesInfo:
+ The host {{ $labels.name }} endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes.
+{%- endraw %}
+ HeatAPIServiceDownMajor:
if: >-
- http_response_status{service=~"heat.*-api"} == 0
+ count(http_response_status{name=~"heat.*-api"} == 0) by (name) >= count(http_response_status{name=~"heat.*-api"}) by (name) * {{ major_threshold }}
for: 2m
labels:
- severity: info
- service: "{{ $labels.service }}"
+ severity: major
+ service: heat
annotations:
- summary: "HTTP check for '{{ $labels.service }}' down"
+ summary: "{{major_threshold * 100}}% of host {% raw %}{{ $labels.name }} endpoints are not accessible"
description: >-
- The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
- HeatAPIServicesWarning:
+ {{ $value }} host {{ $labels.name }} endpoints are not accessible for at least 2 minutes (at least {% endraw %}{{major_threshold * 100}}{% raw %}%).
+ HeatAPIServiceOutage:
if: >-
- count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
- for: 2m
- labels:
- severity: warning
- service: "{{ $labels.service }}"
- annotations:
- summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
- description: >-
- {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})
- HeatAPIServicesCritical:
- if: >-
- count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
+ count(http_response_status{name=~"heat.*-api"} == 0) by (name) == count(http_response_status{name=~"heat.*-api"}) by (name)
for: 2m
labels:
severity: critical
- service: "{{ $labels.service }}"
+ service: heat
annotations:
- summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.service }} services are down"
+ summary: "Host {{ $labels.name }} outage"
description: >-
- {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})
- HeatAPIServicesDown:
- if: >-
- count(http_response_status{service=~"heat.*-api"} == 0) by (service) == on (service) count(http_response_status{service=~"heat.*-api"}) by (service)
- for: 2m
- labels:
- severity: down
- service: "{{ $labels.service }}"
- annotations:
- summary: "All {{ $labels.service }} services are down"
- description: >-
- All {{ $labels.service }} services are down for the last 2 minutes
- HeatErrorLogsTooHigh:
+ All available host {{ $labels.name }} endpoints are not accessible for at least 2 minutes.
{%- endraw %}
+ HeatErrorLogsTooHigh:
{%- set log_threshold = monitoring.error_log_rate|float %}
if: >-
sum(rate(log_messages{service="heat",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > {{ log_threshold }}
{%- raw %}
labels:
severity: warning
- service: "{{ $labels.service }}"
+ service: heat
annotations:
- summary: 'Too many errors in {{ $labels.service }} logs'
- description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
+ summary: "High number of errors in Heat logs"
+ description: "The average per-second rate of errors in Heat logs on the {{ $labels.host }} node is {{ $value }} (as measured over the last 5 minutes)."
+{%- endraw %}
{%- endif %}
{%- endif %}