blob: 48be8c2cd31068e04890809d342d25bd40ab07d0 [file] [log] [blame]
{%- if pillar.heat.server is defined %}
{%- from "heat/map.jinja" import server, monitoring with context %}
{%- if server.get('enabled', False) %}
{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
{% raw %}
# Heat alert definitions.
# This section is rendered only when pillar.heat.server is defined and the
# server settings imported from heat/map.jinja have 'enabled' set to a true
# value (it defaults to False).
# major_threshold is monitoring.endpoint_failed_major_threshold cast to float;
# it is used further down by the HeatApiEndpointsDownMajor alert.
# The Jinja raw region opened above keeps the $labels / $value placeholders in
# the alert texts from being evaluated by Jinja, so they reach the rendered
# output verbatim.
server:
alert:
# HeatApiDown: raised for each openstack_api_check_status series whose name
# matches heat.* and whose value is 0 (reported below as "endpoint is not
# accessible"). Severity is major. There is no "for" clause, so no hold
# duration is configured for this alert.
HeatApiDown:
if: >-
openstack_api_check_status{name=~"heat.*"} == 0
labels:
severity: major
service: heat
annotations:
summary: "{{ $labels.name }} endpoint is not accessible"
description: >-
Heat API is not accessible for the {{ $labels.name }} endpoint.
# HeatApiOutage: raised when the maximum of openstack_api_check_status over
# all series named heat.* is 0, i.e. no matching series is above 0.
# Severity is critical. There is no "for" clause on this alert.
HeatApiOutage:
if: >-
max(openstack_api_check_status{name=~"heat.*"}) == 0
labels:
severity: critical
service: heat
annotations:
summary: "Heat API outage"
description: >-
Heat API is not accessible for all available Heat endpoints in the OpenStack service catalog.
# HeatApiEndpointDown: raised per series when http_response_status for a name
# matching heat.*-api has been 0 for 2 minutes. Severity is minor. The
# description identifies the affected node through the host label.
HeatApiEndpointDown:
if: >-
http_response_status{name=~"heat.*-api"} == 0
for: 2m
labels:
severity: minor
service: heat
annotations:
summary: "{{ $labels.name }} endpoint is not accessible"
description: >-
The {{ $labels.name }} endpoint on the {{ $labels.host }} node is not accessible for 2 minutes.
{%- endraw %}
# HeatApiEndpointsDownMajor: raised when, for a given name matching
# heat.*-api, the number of http_response_status series equal to 0 has been
# at least major_threshold times the total number of series for that name
# for 2 minutes. Severity is major.
# The percentage shown in the texts is rendered with %g so that float
# artifacts do not leak into the message (0.6 is shown as 60%, not 60.0%).
# NOTE: this alert ends inside an open Jinja raw region; the tag that closes
# it follows the next alert definition.
HeatApiEndpointsDownMajor:
if: >-
count(http_response_status{name=~"heat.*-api"} == 0) by (name) >= count(http_response_status{name=~"heat.*-api"}) by (name) * {{ major_threshold }}
for: 2m
labels:
severity: major
service: heat
annotations:
summary: "{{ '%g'|format(major_threshold * 100) }}% of {% raw %}{{ $labels.name }} endpoints are not accessible"
description: >-
{{ $value }} {{ $labels.name }} endpoints (>= {% endraw %}{{ '%g'|format(major_threshold * 100) }}{% raw %}%) are not accessible for 2 minutes.
# HeatApiEndpointsOutage: raised when, for a given name matching heat.*-api,
# the number of http_response_status series equal to 0 is the same as the
# total number of series for that name, and this has held for 2 minutes.
# Severity is critical.
HeatApiEndpointsOutage:
if: >-
count(http_response_status{name=~"heat.*-api"} == 0) by (name) == count(http_response_status{name=~"heat.*-api"}) by (name)
for: 2m
labels:
severity: critical
service: heat
annotations:
summary: "{{ $labels.name }} endpoints outage"
description: >-
All available {{ $labels.name }} endpoints are not accessible for 2 minutes.
{%- endraw %}
# HeatErrorLogsTooHigh: raised when the per-second rate of log_messages for
# service "heat" with level error, emergency or fatal (matched
# case-insensitively), taken over 5 minutes and summed across levels,
# exceeds log_threshold. log_threshold is monitoring.error_log_rate cast to
# float. Severity is warning. There is no "for" clause on this alert.
HeatErrorLogsTooHigh:
{%- set log_threshold = monitoring.error_log_rate|float %}
if: >-
sum(rate(log_messages{service="heat",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > {{ log_threshold }}
{%- raw %}
labels:
severity: warning
service: heat
annotations:
summary: "High number of errors in Heat logs"
description: "The average per-second rate of errors in Heat logs on the {{ $labels.host }} node is {{ $value }} (as measured over the last 5 minutes)."
{%- endraw %}
{%- endif %}
{%- endif %}