blob: 532b3d5305f379d9ad728834a49e3df162fd29b3 [file] [log] [blame]
{% from "nova/map.jinja" import controller, compute, monitoring with context %}
{#- Role flags taken from the imported nova map; each falls back to False when the 'enabled' key is absent. #}
{%- set is_controller = controller.get('enabled', False) %}
{%- set is_compute = compute.get('enabled', False) %}
{#- Nothing below is rendered unless at least one of the two roles is enabled. #}
{%- if is_controller or is_compute %}
{#- Libvirt exporter definition, emitted only on compute nodes and only when the
    rendering context supplies an 'exporters' mapping (it is not imported in this
    file, so it is presumably provided by the including template; verify against the caller). #}
{%- if is_compute and exporters is defined %}
{#- 'packages' always has a value here thanks to the default tuple, so no extra 'is defined' guard is needed. #}
{%- set packages = exporters.get('libvirt', {}).get('packages', ('libvirt-exporter', )) %}
{%- load_yaml as new_exporters_cfg %}
exporters:
  libvirt:
    enabled: true
    packages:
    {%- for pkg in packages %}
      - {{ pkg }}
    {%- endfor %}
    services:
      qemu:
        enabled: true
        bind:
          address: 0.0.0.0
          port: 9177
{%- endload %}
{{ new_exporters_cfg|yaml(False) }}
{%- endif %}
{#- Alert rules live under server -> alert. #}
server:
alert:
{#- The controller-only section starts here. #}
{%- if is_controller %}
{#- Failure-ratio thresholds read from the monitoring map and coerced with the float filter;
    the alert expressions further down use them as multipliers. #}
{%- set minor_threshold = monitoring.services_failed_warning_threshold_percent|float %}
{%- set major_threshold = monitoring.services_failed_critical_threshold_percent|float %}
{%- set minor_compute_threshold = monitoring.computes_failed_warning_threshold_percent|float %}
{%- set major_compute_threshold = monitoring.computes_failed_critical_threshold_percent|float %}
{%- set major_endpoint_threshold = monitoring.endpoint_failed_major_threshold|float %}
{% raw %}
# Critical: the highest openstack_api_check_status among the nova.* and placement checks is 0.
# The raw section opened above stays open for the alerts that follow.
NovaAPIOutage:
  if: 'max(openstack_api_check_status{name=~"nova.*|placement"}) == 0'
  labels:
    service: nova
    severity: critical
  annotations:
    description: "Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog."
    summary: "Nova API outage"
# Major: raised for each nova.* or placement check whose openstack_api_check_status is 0.
# This rule sits inside a raw section, so the Prometheus label templates pass through untouched.
NovaAPIDown:
  if: 'openstack_api_check_status{name=~"nova.*|placement"} == 0'
  labels:
    service: nova
    severity: major
  annotations:
    description: "Nova API is not accessible for the {{ $labels.name }} endpoint."
    summary: "{{ $labels.name }} endpoint is not accessible"
# Minor: a nova-api http_response_status series has been 0 for two minutes.
# The raw section that covers this rule is closed right after it.
NovaAPIServiceDown:
  if: 'http_response_status{name=~"nova-api"} == 0'
  for: 2m
  labels:
    service: nova
    severity: minor
  annotations:
    description: "The host nova-api endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes."
    summary: "Host nova-api endpoint is not accessible"
{%- endraw %}
{#- Major: the number of nova-api http_response_status series equal to 0 has reached the
    configured share (major_endpoint_threshold) of all such series for two minutes.
    The percentage shown in the text is rounded so floating-point noise from the
    multiplication never leaks into the alert message.
    The raw section opened at the very end of the description is intentionally left
    open: the rules that follow rely on it. #}
NovaAPIServiceDownMajor:
  if: >-
    count(http_response_status{name=~"nova-api"} == 0) >= count(http_response_status{name=~"nova-api"}) * {{ major_endpoint_threshold }}
  for: 2m
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (major_endpoint_threshold * 100)|round(2) }}% of host nova-api endpoints are not accessible"
    description: >-
      {% raw %}{{ $value }} host nova-api endpoints are not accessible for at least 2 minutes (at least {% endraw %}{{ (major_endpoint_threshold * 100)|round(2) }}{% raw %}%).
# Critical: every nova-api http_response_status series has been 0 for two minutes.
# This rule sits inside a raw section opened earlier in the template.
NovaAPIServiceOutage:
  if: 'count(http_response_status{name=~"nova-api"} == 0) == count(http_response_status{name=~"nova-api"})'
  for: 2m
  labels:
    service: nova
    severity: critical
  annotations:
    description: "All available host nova-api endpoints are not accessible for at least 2 minutes."
    summary: "Host nova-api outage"
# Minor: raised for each openstack_nova_service_state series that equals 0.
# The raw section that covers this rule is closed right after it.
NovaServiceDown:
  if: 'openstack_nova_service_state == 0'
  labels:
    service: nova
    severity: minor
  annotations:
    description: "The {{ $labels.binary }} service on the {{ $labels.hostname }} node is down."
    summary: "{{ $labels.binary }} service is down"
{%- endraw %}
{#- Minor: per binary (nova-compute excluded), the count of services in state 0 has
    reached minor_threshold times the count of all services of that binary.
    The raw sections are kept local to each line and closed without whitespace
    stripping, so the blank before "(at least" survives rendering.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaServicesDownMinor:
  if: >-
    count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{minor_threshold}}
  labels:
    severity: minor
    service: nova
  annotations:
    summary: "{{ (minor_threshold * 100)|round(2) }}%{% raw %} of {{ $labels.binary }} services are down{% endraw %}"
    description: >-
      {% raw %}{{ $value }} {{ $labels.binary }} services are down {% endraw %}(at least {{ (minor_threshold * 100)|round(2) }}%).
{#- Minor: the count of nova-compute services in state 0 has reached
    minor_compute_threshold times the count of all nova-compute services.
    The raw sections are kept local to each line and closed without whitespace
    stripping, so the blank before "(at least" survives rendering.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaComputeServicesDownMinor:
  if: >-
    count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{minor_compute_threshold}}
  labels:
    severity: minor
    service: nova
  annotations:
    summary: "{{ (minor_compute_threshold * 100)|round(2) }}%{% raw %} of nova-compute services are down{% endraw %}"
    description: >-
      {% raw %}{{ $value }} nova-compute services are down {% endraw %}(at least {{ (minor_compute_threshold * 100)|round(2) }}%).
{#- Major: per binary (nova-compute excluded), the count of services in state 0 has
    reached major_threshold times the count of all services of that binary.
    The raw sections are kept local to each line and closed without whitespace
    stripping, so the blank before "(at least" survives rendering.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaServicesDownMajor:
  if: >-
    count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{major_threshold}}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (major_threshold * 100)|round(2) }}%{% raw %} of {{ $labels.binary }} services are down{% endraw %}"
    description: >-
      {% raw %}{{ $value }} {{ $labels.binary }} services are down {% endraw %}(at least {{ (major_threshold * 100)|round(2) }}%).
{#- Major: the count of nova-compute services in state 0 has reached
    major_compute_threshold times the count of all nova-compute services.
    The inline raw sections are closed without whitespace stripping, so the blank
    before "(at least" survives rendering; the displayed percentage is rounded to
    hide floating-point noise.
    The raw section opened at the very end of the description is intentionally left
    open: the rule that follows relies on it. #}
NovaComputeServicesDownMajor:
  if: >-
    count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{major_compute_threshold}}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (major_compute_threshold * 100)|round(2) }}%{% raw %} of nova-compute services are down{% endraw %}"
    description: >-
      {% raw %}{{ $value }} nova-compute services are down {% endraw %}(at least {{ (major_compute_threshold * 100)|round(2) }}%).{%- raw %}
# Critical: for a given binary, the number of services in state 0 equals the number of all its services.
# The raw section that covers this rule is closed right after it.
NovaServiceOutage:
  if: 'count(openstack_nova_service_state == 0) by (binary) == on (binary) count(openstack_nova_service_state) by (binary)'
  labels:
    service: nova
    severity: critical
  annotations:
    description: "All {{ $labels.binary }} services are down."
    summary: "{{ $labels.binary }} service outage"
{%- endraw -%}
{#- Resource usage thresholds read from the monitoring map and coerced with the float
    filter; the capacity alerts below use them as multipliers.
    The last statement deliberately has no trailing whitespace-stripping marker:
    stripping there would swallow the line break in front of the next alert name
    and glue that name onto the preceding description line. #}
{%- set cpu_minor_threshold = monitoring.cpu_minor_threshold|float %}
{%- set cpu_major_threshold = monitoring.cpu_major_threshold|float %}
{%- set ram_major_threshold = monitoring.ram_major_threshold|float %}
{%- set ram_critical_threshold = monitoring.ram_critical_threshold|float %}
{%- set disk_major_threshold = monitoring.disk_major_threshold|float %}
{%- set disk_critical_threshold = monitoring.disk_critical_threshold|float %}
{#- Minor: system_load15 (relabelled from host to hostname so it can be matched) has
    reached openstack_nova_vcpus times cpu_minor_threshold on a node.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaHypervisorVCPUsFullMinor:
  if: >-
    label_replace(system_load15, "hostname", "$1", "host", "(.*)") >= on (hostname) openstack_nova_vcpus * {{ cpu_minor_threshold }}
  labels:
    severity: minor
    service: nova
  annotations:
    summary: "{{ (cpu_minor_threshold * 100)|round(2) }}% of hypervisor VCPUs were used"
    description: "{% raw %}{{ $value }} VCPUs on the {{ $labels.hostname }}{% endraw %} node were used (at least {{ (cpu_minor_threshold * 100)|round(2) }}%)."
{#- Major: system_load15 (relabelled from host to hostname so it can be matched) has
    reached openstack_nova_vcpus times cpu_major_threshold on a node.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaHypervisorVCPUsFullMajor:
  if: >-
    label_replace(system_load15, "hostname", "$1", "host", "(.*)") >= on (hostname) openstack_nova_vcpus * {{ cpu_major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (cpu_major_threshold * 100)|round(2) }}% of hypervisor VCPUs were used"
    description: "{% raw %}{{ $value }} VCPUs on the {{ $labels.hostname }}{% endraw %} node were used (at least {{ (cpu_major_threshold * 100)|round(2) }}%)."
{#- Major: openstack_nova_used_ram has reached openstack_nova_ram times ram_major_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaHypervisorMemoryFullMajor:
  if: >-
    openstack_nova_used_ram >= openstack_nova_ram * {{ ram_major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (ram_major_threshold * 100)|round(2) }}% of hypervisor RAM was used"
    description: "{% raw %}{{ $value }}MB of RAM on the {{ $labels.hostname }}{% endraw %} node was used (at least {{ (ram_major_threshold * 100)|round(2) }}%)."
{#- Critical: openstack_nova_used_ram has reached openstack_nova_ram times ram_critical_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaHypervisorMemoryFullCritical:
  if: >-
    openstack_nova_used_ram >= openstack_nova_ram * {{ ram_critical_threshold }}
  labels:
    severity: critical
    service: nova
  annotations:
    summary: "{{ (ram_critical_threshold * 100)|round(2) }}% of hypervisor RAM was used"
    description: "{% raw %}{{ $value }}MB of RAM on the {{ $labels.hostname }}{% endraw %} node was used (at least {{ (ram_critical_threshold * 100)|round(2) }}%)."
{#- Major: openstack_nova_used_disk has reached openstack_nova_disk times disk_major_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaHypervisorDiskFullMajor:
  if: >-
    openstack_nova_used_disk >= openstack_nova_disk * {{ disk_major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (disk_major_threshold * 100)|round(2) }}% of hypervisor disk space was used"
    description: "{% raw %}{{ $value }}GB of disk space on the {{ $labels.hostname }}{% endraw %} node was used (at least {{ (disk_major_threshold * 100)|round(2) }}%)."
{#- Critical: openstack_nova_used_disk has reached openstack_nova_disk times disk_critical_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaHypervisorDiskFullCritical:
  if: >-
    openstack_nova_used_disk >= openstack_nova_disk * {{ disk_critical_threshold }}
  labels:
    severity: critical
    service: nova
  annotations:
    summary: "{{ (disk_critical_threshold * 100)|round(2) }}% of hypervisor disk space was used"
    description: "{% raw %}{{ $value }}GB of disk space on the {{ $labels.hostname }}{% endraw %} node was used (at least {{ (disk_critical_threshold * 100)|round(2) }}%)."
{#- Major: openstack_nova_aggregate_used_ram has reached openstack_nova_aggregate_ram times ram_major_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaAggregateMemoryFullMajor:
  if: >-
    openstack_nova_aggregate_used_ram >= openstack_nova_aggregate_ram * {{ ram_major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (ram_major_threshold * 100)|round(2) }}% of aggregate RAM was used"
    description: "{% raw %}{{ $value }}MB of RAM on the {{ $labels.aggregate }}{% endraw %} aggregate was used (at least {{ (ram_major_threshold * 100)|round(2) }}%)."
{#- Critical: openstack_nova_aggregate_used_ram has reached openstack_nova_aggregate_ram times ram_critical_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaAggregateMemoryFullCritical:
  if: >-
    openstack_nova_aggregate_used_ram >= openstack_nova_aggregate_ram * {{ ram_critical_threshold }}
  labels:
    severity: critical
    service: nova
  annotations:
    summary: "{{ (ram_critical_threshold * 100)|round(2) }}% of aggregate RAM was used"
    description: "{% raw %}{{ $value }}MB of RAM on the {{ $labels.aggregate }}{% endraw %} aggregate was used (at least {{ (ram_critical_threshold * 100)|round(2) }}%)."
{#- Major: openstack_nova_aggregate_used_disk has reached openstack_nova_aggregate_disk times disk_major_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaAggregateDiskFullMajor:
  if: >-
    openstack_nova_aggregate_used_disk >= openstack_nova_aggregate_disk * {{ disk_major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (disk_major_threshold * 100)|round(2) }}% of aggregate disk space was used"
    description: "{% raw %}{{ $value }}GB of disk space on the {{ $labels.aggregate }}{% endraw %} aggregate was used (at least {{ (disk_major_threshold * 100)|round(2) }}%)."
{#- Critical: openstack_nova_aggregate_used_disk has reached openstack_nova_aggregate_disk times disk_critical_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaAggregateDiskFullCritical:
  if: >-
    openstack_nova_aggregate_used_disk >= openstack_nova_aggregate_disk * {{ disk_critical_threshold }}
  labels:
    severity: critical
    service: nova
  annotations:
    summary: "{{ (disk_critical_threshold * 100)|round(2) }}% of aggregate disk space was used"
    description: "{% raw %}{{ $value }}GB of disk space on the {{ $labels.aggregate }}{% endraw %} aggregate was used (at least {{ (disk_critical_threshold * 100)|round(2) }}%)."
{#- Minor: the sum of system_load15 over hosts that also report openstack_nova_vcpus has
    reached cpu_minor_threshold times the largest per-instance sum of openstack_nova_vcpus.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaTotalVCPUsFullMinor:
  if: >-
    sum(label_replace(system_load15, "hostname", "$1", "host", "(.*)") and on (hostname) openstack_nova_vcpus) >= max(sum(openstack_nova_vcpus) by (instance)) * {{ cpu_minor_threshold }}
  labels:
    severity: minor
    service: nova
  annotations:
    summary: "{{ (cpu_minor_threshold * 100)|round(2) }}% of cloud VCPUs were used"
    description: "{% raw %}{{ $value }}{% endraw %} VCPUs in the cloud were used (at least {{ (cpu_minor_threshold * 100)|round(2) }}%)."
{#- Major: the sum of system_load15 over hosts that also report openstack_nova_vcpus has
    reached cpu_major_threshold times the largest per-instance sum of openstack_nova_vcpus.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaTotalVCPUsFullMajor:
  if: >-
    sum(label_replace(system_load15, "hostname", "$1", "host", "(.*)") and on (hostname) openstack_nova_vcpus) >= max(sum(openstack_nova_vcpus) by (instance)) * {{ cpu_major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (cpu_major_threshold * 100)|round(2) }}% of cloud VCPUs were used"
    description: "{% raw %}{{ $value }}{% endraw %} VCPUs in the cloud were used (at least {{ (cpu_major_threshold * 100)|round(2) }}%)."
{#- Major: openstack_nova_total_used_ram has reached openstack_nova_total_ram times ram_major_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaTotalMemoryFullMajor:
  if: >-
    openstack_nova_total_used_ram >= openstack_nova_total_ram * {{ ram_major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (ram_major_threshold * 100)|round(2) }}% of cloud RAM was used"
    description: "{% raw %}{{ $value }}MB{% endraw %} of RAM in the cloud was used (at least {{ (ram_major_threshold * 100)|round(2) }}%)."
{#- Critical: openstack_nova_total_used_ram has reached openstack_nova_total_ram times ram_critical_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaTotalMemoryFullCritical:
  if: >-
    openstack_nova_total_used_ram >= openstack_nova_total_ram * {{ ram_critical_threshold }}
  labels:
    severity: critical
    service: nova
  annotations:
    summary: "{{ (ram_critical_threshold * 100)|round(2) }}% of cloud RAM was used"
    description: "{% raw %}{{ $value }}MB{% endraw %} of RAM in the cloud was used (at least {{ (ram_critical_threshold * 100)|round(2) }}%)."
{#- Major: openstack_nova_total_used_disk has reached openstack_nova_total_disk times disk_major_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaTotalDiskFullMajor:
  if: >-
    openstack_nova_total_used_disk >= openstack_nova_total_disk * {{ disk_major_threshold }}
  labels:
    severity: major
    service: nova
  annotations:
    summary: "{{ (disk_major_threshold * 100)|round(2) }}% of cloud disk space was used"
    description: "{% raw %}{{ $value }}GB{% endraw %} of disk space in the cloud was used (at least {{ (disk_major_threshold * 100)|round(2) }}%)."
{#- Critical: openstack_nova_total_used_disk has reached openstack_nova_total_disk times disk_critical_threshold.
    The displayed percentage is rounded to hide floating-point noise. #}
NovaTotalDiskFullCritical:
  if: >-
    openstack_nova_total_used_disk >= openstack_nova_total_disk * {{ disk_critical_threshold }}
  labels:
    severity: critical
    service: nova
  annotations:
    summary: "{{ (disk_critical_threshold * 100)|round(2) }}% of cloud disk space was used"
    description: "{% raw %}{{ $value }}GB{% endraw %} of disk space in the cloud was used (at least {{ (disk_critical_threshold * 100)|round(2) }}%)."
{%- endif %}
{#- Warning: the summed 5-minute per-second rate of nova log messages at level error,
    emergency or fatal (case-insensitive match, aggregated without the level label)
    exceeds monitoring.error_log_rate.warn.
    This rule is rendered for both controller and compute roles. #}
{%- set log_threshold = monitoring.error_log_rate.warn|float %}
NovaErrorLogsTooHigh:
  if: >-
    sum(rate(log_messages{service="nova",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > {{ log_threshold }}
  labels:
    service: nova
    severity: warning
  annotations:
    description: "{% raw %}The average per-second rate of errors in Nova logs on the {{ $labels.host }} node is {{ $value }} (as measured over the last 5 minutes).{% endraw %}"
    summary: "High number of errors in Nova logs"
{#- Compute-only section, rendered when the context supplies an 'exporters' mapping.
    Critical: libvirt_up has been 0 for two minutes.
    The exporter configuration template is included right after the rule. #}
{%- if is_compute and exporters is defined %}
LibvirtDown:
  if: >-
    libvirt_up == 0
  for: 2m
  labels:
    service: libvirt
    severity: critical
  annotations:
    description: "The Libvirt metric exporter fails to gather metrics on the {% raw %}{{ $labels.host }}{% endraw %} node for at least 2 minutes."
    summary: "Failure to gather Libvirt metrics"
{%- include "prometheus/_exporters_config.sls" %}
{%- endif %}
{%- endif %}