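{#- blob: dc3d01c619091a8e22291541396d25e5233388dd [file] [log] [blame] (looks like a repository-browser header, kept only as a comment so it is not rendered into the YAML output) #}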
{#-
  Prometheus alerting rules for the Heat service.

  Rules are emitted only when the heat server pillar is present and
  server.enabled is true; otherwise no rules are produced.
  The raw/endraw sections keep the Prometheus-side $labels / $value
  template expressions from being evaluated by Jinja, so they reach the
  rendered YAML verbatim.
#}
{%- if pillar.heat.server is defined %}
{%- from "heat/map.jinja" import server with context %}
{%- if server.get('enabled', False) %}
{% raw %}
server:
  alert:
    # Fires when, for a given service label matching "heat.+", the maximum
    # openstack_api_check_status across its series is 0, sustained for 2m.
    HeatAPIDown:
      if: >-
        max(openstack_api_check_status{service=~"heat.+"}) by (service) == 0
      for: 2m
      labels:
        severity: down
        service: "{{ $labels.service }}"
      annotations:
        summary: "Endpoint check for '{{ $labels.service }}' is down"
        description: >-
          Endpoint check for '{{ $labels.service }}' is down for 2 minutes
    # Fires when the 5-minute rate of heat log messages at level
    # error/emergency/fatal, summed across levels, exceeds the threshold.
    HeatErrorLogsTooHigh:
      {%- endraw %}
      {#-
        Threshold defaults to 0.2 and can be overridden through
        alert.HeatErrorLogsTooHigh.var.threshold on prometheus_server.
        NOTE(review): prometheus_server is not imported or set in this
        file; it is presumably supplied by the template context that
        includes this file. TODO confirm.
      #}
      {%- set log_threshold = prometheus_server.get('alert', {}).get('HeatErrorLogsTooHigh', {}).get('var', {}).get('threshold', 0.2 ) %}
      if: >-
        sum(rate(log_messages{service="heat",level=~"error|emergency|fatal"}[5m])) without (level) > {{ log_threshold }}
      {%- raw %}
      labels:
        severity: warning
        service: "{{ $labels.service }}"
      annotations:
        summary: 'Too many errors in {{ $labels.service }} logs'
        description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
{%- endif %}
{%- endif %}