{#-
  Alert rule definitions for the Heat API (Prometheus-style rules; the
  consumer of this file is not visible here).

  The rules are rendered only when pillar.heat.server is defined and the
  server is enabled. Thresholds come from the "monitoring" mapping imported
  from heat/map.jinja. The double-brace $labels / $value placeholders are
  kept inside raw sections so that Jinja emits them unchanged instead of
  trying to evaluate them.
#}
{%- if pillar.heat.server is defined %}

{%- from "heat/map.jinja" import server, monitoring with context %}
{%- if server.get('enabled', False) %}
{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
{#- Rounded so that float noise (e.g. 0.07 * 100 = 7.000000000000001) never reaches the alert text. #}
{%- set major_threshold_pct = (major_threshold * 100)|round(2) %}
{%- set log_threshold = monitoring.error_log_rate|float %}
{% raw %}
server:
  alert:
    # One Heat endpoint reports an API check status of 0.
    HeatApiDown:
      if: >-
        openstack_api_check_status{name=~"heat.*"} == 0
      labels:
        severity: major
        service: heat
      annotations:
        summary: "{{ $labels.name }} endpoint is not accessible"
        description: >-
          Heat API is not accessible for the {{ $labels.name }} endpoint.
    # The highest API check status across all Heat endpoints is 0, i.e. every one of them is failing.
    HeatApiOutage:
      if: >-
        max(openstack_api_check_status{name=~"heat.*"}) == 0
      labels:
        severity: critical
        service: heat
      annotations:
        summary: "Heat API outage"
        description: >-
          Heat API is not accessible for all available Heat endpoints in the OpenStack service catalog.
    # A single heat*-api HTTP check has returned status 0 for 2 minutes.
    HeatApiEndpointDown:
      if: >-
        http_response_status{name=~"heat.*-api"} == 0
      for: 2m
      labels:
        severity: minor
        service: heat
      annotations:
        summary: "{{ $labels.name }} endpoint is not accessible"
        description: >-
          The {{ $labels.name }} endpoint on the {{ $labels.host }} node is not accessible for 2 minutes.
    {%- endraw %}
    # Per endpoint name: the number of failing checks is at least the configured
    # fraction (major threshold) of all checks, sustained for 2 minutes.
    HeatApiEndpointsDownMajor:
      if: >-
        count(http_response_status{name=~"heat.*-api"} == 0) by (name) >= count(http_response_status{name=~"heat.*-api"}) by (name) * {{ major_threshold }}
      for: 2m
      labels:
        severity: major
        service: heat
      annotations:
        summary: "{{ major_threshold_pct }}% of {% raw %}{{ $labels.name }} endpoints are not accessible"
        description: >-
          {{ $value }} {{ $labels.name }} endpoints (>= {% endraw %}{{ major_threshold_pct }}{% raw %}%) are not accessible for 2 minutes.
    # Per endpoint name: the number of failing checks equals the total number of checks for 2 minutes.
    HeatApiEndpointsOutage:
      if: >-
        count(http_response_status{name=~"heat.*-api"} == 0) by (name) == count(http_response_status{name=~"heat.*-api"}) by (name)
      for: 2m
      labels:
        severity: critical
        service: heat
      annotations:
        summary: "{{ $labels.name }} endpoints outage"
        description: >-
          All available {{ $labels.name }} endpoints are not accessible for 2 minutes.
    {%- endraw %}
    # The 5-minute per-second rate of error, emergency and fatal log messages
    # for the heat service exceeds the configured error log rate.
    HeatErrorLogsTooHigh:
      if: >-
        sum(rate(log_messages{service="heat",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > {{ log_threshold }}
      {%- raw %}
      labels:
        severity: warning
        service: heat
      annotations:
        summary: "High number of errors in Heat logs"
        description: "The average per-second rate of errors in Heat logs on the {{ $labels.host }} node is {{ $value }} (as measured over the last 5 minutes)."
      {%- endraw %}
{%- endif %}
{%- endif %}