{#- blob: 9029265e0f4d017b7bc51bd738e071d5d426ee45 [file] [log] [blame] (source-viewer header, not part of the template) #}
{% from "nova/map.jinja" import controller, compute, monitoring with context %}
{#- Role flags read from the imported nova map; each falls back to False when the role has no 'enabled' key. #}
{%- set is_controller = controller.get('enabled', False) %}
{%- set is_compute = compute.get('enabled', False) %}
{#- The content guarded by this condition is rendered only when at least one of the two roles is enabled. #}
{%- if is_controller or is_compute %}
{#- On compute nodes, when an `exporters` variable exists in the rendering context, build the libvirt exporter definition and emit it as YAML. #}
{%- if is_compute and exporters is defined %}
{#- Package list comes from exporters.libvirt.packages and falls back to the single libvirt-exporter package. #}
{%- set packages = exporters.get('libvirt', {}).get('packages', ('libvirt-exporter', )) %}
{#- `packages` is always set by the line above, so no "is defined" guard is needed around the list. #}
{%- load_yaml as new_exporters_cfg %}
exporters:
  libvirt:
    enabled: true
    packages:
    {%- for pkg in packages %}
    - {{ pkg }}
    {%- endfor %}
    services:
      qemu:
        enabled: true
        bind:
          address: 0.0.0.0
          port: 9177
{%- endload %}
{{ new_exporters_cfg|yaml(False) }}
{%- endif %}
{#- Alert rule definitions follow under the `server` / `alert` keys (nesting presumed; indentation is not visible here, confirm against the formula). #}
server:
alert:
{%- if is_controller %}
{#- Thresholds are cast to float and used as fractions: the alert expressions multiply them by a service count, and the annotations multiply them by 100 to print a percentage. NOTE(review): the pillar keys are named *_percent although they are used as fractions; confirm the expected value range. #}
{%- set minor_threshold = monitoring.services_failed_warning_threshold_percent|float %}
{%- set major_threshold = monitoring.services_failed_critical_threshold_percent|float %}
{%- set minor_compute_threshold = monitoring.computes_failed_warning_threshold_percent|float %}
{%- set major_compute_threshold = monitoring.computes_failed_critical_threshold_percent|float %}
{% raw %}
# Critical: the maximum API check status over every endpoint whose name
# matches nova.* or placement is 0, so none of them reports a non-zero status.
NovaAPIOutage:
  if: 'max(openstack_api_check_status{name=~"nova.*|placement"}) == 0'
  labels:
    service: nova
    severity: critical
  annotations:
    summary: 'Nova API outage'
    description: "Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog."
# Major: a single endpoint whose name matches nova.* or placement reports
# check status 0; the endpoint name is taken from the series' name label.
NovaAPIDown:
  if: 'openstack_api_check_status{name=~"nova.*|placement"} == 0'
  labels:
    service: nova
    severity: major
  annotations:
    summary: '{{ $labels.name }} endpoint is not accessible'
    description: "Nova API is not accessible for the {{ $labels.name }} endpoint."
# Minor: the HTTP response status metric named nova-api has been 0 on a host
# for 2 minutes.
NovaAPIServiceDown:
  if: 'http_response_status{name=~"nova-api"} == 0'
  for: 2m
  labels:
    service: nova
    severity: minor
  annotations:
    summary: 'Host nova-api endpoint is not accessible'
    description: "The host nova-api endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes."
# Minor: one Nova service instance reports state 0; the binary and hostname
# labels of the series identify which one.
NovaServiceDown:
  if: 'openstack_nova_service_state == 0'
  labels:
    service: nova
    severity: minor
  annotations:
    summary: '{{ $labels.binary }} service is down'
    description: "The {{ $labels.binary }} service on the {{ $labels.hostname }} node is down."
{%- endraw %}
# Minor: per non-compute binary, the number of services in state 0 has reached
# the warning fraction of that binary's services but is still below the
# critical fraction.
NovaServicesDownMinor:
  if: >-
    count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{ minor_threshold }}
    and
    count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) < on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{ major_threshold }}
  labels:
    severity: minor
    service: nova
  annotations:
    summary: "{{ minor_threshold * 100 }}%{% raw %} of {{ $labels.binary }} services are down{% endraw %}"
    # The raw span is closed before the separating space so whitespace control
    # cannot swallow it; the text renders as "... are down (at least N%)."
    description: >-
      {% raw %}{{ $value }} {{ $labels.binary }} services are down{% endraw %} (at least {{ minor_threshold * 100 }}%).
# Minor: the number of nova-compute services in state 0 has reached the
# warning fraction of all nova-compute services but is still below the
# critical fraction.
NovaComputeServicesDownMinor:
  if: >-
    count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{ minor_compute_threshold }}
    and
    count(openstack_nova_service_state{binary="nova-compute"} == 0) < count(openstack_nova_service_state{binary="nova-compute"}) * {{ major_compute_threshold }}
  labels:
    severity: minor
    service: nova
  annotations:
    summary: "{{ minor_compute_threshold * 100 }}%{% raw %} of nova-compute services are down{% endraw %}"
    # The raw span is closed before the separating space so whitespace control
    # cannot swallow it; the text renders as "... are down (at least N%)."
    description: >-
      {% raw %}{{ $value }} nova-compute services are down{% endraw %} (at least {{ minor_compute_threshold * 100 }}%).
# Major: per non-compute binary, the number of services in state 0 has reached
# the critical fraction of that binary's services.
NovaServicesDownMajor:
  if: >-
    count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{ major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ major_threshold * 100 }}%{% raw %} of {{ $labels.binary }} services are down{% endraw %}"
    # The raw span is closed before the separating space so whitespace control
    # cannot swallow it; the text renders as "... are down (at least N%)."
    description: >-
      {% raw %}{{ $value }} {{ $labels.binary }} services are down{% endraw %} (at least {{ major_threshold * 100 }}%).
# Major: the number of nova-compute services in state 0 has reached the
# critical fraction of all nova-compute services.
# The raw tag at the very end of this rule is intentional: it opens the raw
# region that the rules following this one rely on.
NovaComputeServicesDownMajor:
  if: >-
    count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{ major_compute_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ major_compute_threshold * 100 }}%{% raw %} of nova-compute services are down{% endraw %}"
    # The raw span is closed before the separating space so whitespace control
    # cannot swallow it; the text renders as "... are down (at least N%)."
    description: >-
      {% raw %}{{ $value }} nova-compute services are down{% endraw %} (at least {{ major_compute_threshold * 100 }}%).{%- raw %}
# Critical: for a given binary, the count of services in state 0 equals the
# count of all services of that binary.
NovaServiceOutage:
  if: >-
    count(openstack_nova_service_state == 0) by (binary)
    == on (binary)
    count(openstack_nova_service_state) by (binary)
  labels:
    service: nova
    severity: critical
  annotations:
    summary: '{{ $labels.binary }} service outage'
    description: "All {{ $labels.binary }} services are down."
# Warning: free VCPUs, as a percentage of free + used VCPUs, have stayed
# below 10 for 1 minute.
NovaTotalFreeVCPUsLow:
  if: >-
    (100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0
  for: 1m
  labels:
    severity: warning
    service: nova
  annotations:
    summary: "VCPU low limit for new instances"
    description: >-
      VCPU low limit for 1 minute
# Warning: free RAM, as a percentage of free + used RAM, has stayed below 10
# for 1 minute.
NovaTotalFreeMemoryLow:
  if: >-
    (100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 10.0
  for: 1m
  labels:
    severity: warning
    service: nova
  annotations:
    summary: "Memory low limit for new instances"
    description: >-
      Memory low limit for 1 minute
# Critical: free VCPUs, as a percentage of free + used VCPUs, have stayed
# below 2 for 1 minute.
NovaTotalFreeVCPUsShortage:
  if: >-
    (100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 2.0
  for: 1m
  labels:
    severity: critical
    service: nova
  annotations:
    summary: "VCPU shortage for new instances"
    description: >-
      VCPU shortage for 1 minute
# Critical: free RAM, as a percentage of free + used RAM, has stayed below 2
# for 1 minute.
NovaTotalFreeMemoryShortage:
  if: >-
    (100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 2.0
  for: 1m
  labels:
    severity: critical
    service: nova
  annotations:
    summary: "Memory shortage for new instances"
    description: >-
      Memory shortage for 1 minute
# Warning: within one aggregate, free VCPUs as a percentage of free + used
# VCPUs have stayed below 10 for 1 minute. The aggregate label of the series
# is copied onto the alert.
NovaAggregatesFreeVCPUsLow:
  if: >-
    (100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 10.0
  for: 1m
  labels:
    severity: warning
    service: nova
    aggregate: "{{ $labels.aggregate }}"
  annotations:
    summary: "VCPU low limit for new instances on aggregate {{ $labels.aggregate }}"
    description: >-
      VCPU low limit for 1 minute on aggregate {{ $labels.aggregate }}
# Warning: within one aggregate, free RAM as a percentage of free + used RAM
# has stayed below 10 for 1 minute. The aggregate label of the series is
# copied onto the alert.
NovaAggregatesFreeMemoryLow:
  if: >-
    (100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 10.0
  for: 1m
  labels:
    severity: warning
    service: nova
    aggregate: "{{ $labels.aggregate }}"
  annotations:
    summary: "Memory low limit for new instances on aggregate {{ $labels.aggregate }}"
    description: >-
      Memory low limit for 1 minute on aggregate {{ $labels.aggregate }}
# Critical: within one aggregate, free VCPUs as a percentage of free + used
# VCPUs have stayed below 2 for 1 minute. The aggregate label of the series
# is copied onto the alert.
NovaAggregatesFreeVCPUsShortage:
  if: >-
    (100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 2.0
  for: 1m
  labels:
    severity: critical
    service: nova
    aggregate: "{{ $labels.aggregate }}"
  annotations:
    summary: "VCPU shortage for new instances on aggregate {{ $labels.aggregate }}"
    description: >-
      VCPU shortage for 1 minute on aggregate {{ $labels.aggregate }}
# Critical: within one aggregate, free RAM as a percentage of free + used RAM
# has stayed below 2 for 1 minute. The aggregate label of the series is
# copied onto the alert.
NovaAggregatesFreeMemoryShortage:
  if: >-
    (100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 2.0
  for: 1m
  labels:
    severity: critical
    service: nova
    aggregate: "{{ $labels.aggregate }}"
  annotations:
    summary: "Memory shortage for new instances on aggregate {{ $labels.aggregate }}"
    description: >-
      Memory shortage for 1 minute on aggregate {{ $labels.aggregate }}
{%- endraw %}
{%- endif %}
{%- set log_threshold = monitoring.error_log_rate.warn|float %}
# Warning: the 5-minute rate of Nova log messages at error, emergency or fatal
# level (matched case-insensitively), summed without the level label, exceeds
# the configured threshold.
NovaErrorLogsTooHigh:
  if: >-
    sum(rate(log_messages{service="nova",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > {{ log_threshold }}
  labels:
    service: nova
    severity: warning
  annotations:
    summary: "High number of errors in Nova logs"
    description: "The rate of errors in Nova logs over the last 5 minutes is too high on the {% raw %}{{ $labels.host }} node (current value={{ $value }}, threshold={% endraw %}{{ log_threshold }})."
{%- if is_compute and exporters is defined %}
{%- raw %}
# Critical: the libvirt_up metric has been 0 on a host for 2 minutes.
# Rendered only on compute nodes when an exporters variable is present.
LibvirtDown:
  if: 'libvirt_up == 0'
  for: 2m
  labels:
    service: libvirt
    severity: critical
  annotations:
    summary: 'Failure to gather Libvirt metrics'
    description: "The Libvirt metric exporter fails to gather metrics on the {{ $labels.host }} node for at least 2 minutes."
{%- endraw %}
{%- include "prometheus/_exporters_config.sls" %}
{%- endif %}
{%- endif %}