blob: cc483634de8572209a2ab4041624e829e910d3d0 [file] [log] [blame]
{% from "nova/map.jinja" import controller, compute with context %}
{#- Role flags read from the controller and compute objects imported from nova/map.jinja; a missing 'enabled' key counts as False. #}
{%- set is_controller = controller.get('enabled', False) %}
{%- set is_compute = compute.get('enabled', False) %}
{#- Everything up to the final endif is rendered only when at least one of the two roles is enabled. #}
{%- if is_controller or is_compute %}
{#- Libvirt exporter settings: rendered only on compute nodes where an
    exporters variable is defined and the compute driver is libvirt
    (libvirt.LibvirtDriver is also the fallback when compute_driver is unset). #}
{%- if is_compute and exporters is defined and compute.get('compute_driver', 'libvirt.LibvirtDriver') == 'libvirt.LibvirtDriver' %}
{#- Package list for the exporter; falls back to libvirt-exporter when the
    exporters data carries no libvirt.packages entry. #}
{%- set packages = exporters.get('libvirt', {}).get('packages', ('libvirt-exporter', )) %}
{%- load_yaml as new_exporters_cfg %}
exporters:
  libvirt:
    enabled: true
{#- packages is always defined by the set statement above, so an "is defined"
    test can never be false. Test for a non-empty value instead: a null value
    would break the loop and an empty list would render a bare "packages:" key. #}
{%- if packages %}
    packages:
    {% for pkg in packages %}
      - {{ pkg }}
    {% endfor %}
{%- endif %}
    services:
      qemu:
        enabled: true
        bind:
          address: 0.0.0.0
          port: 9177
{%- endload %}
{#- Emit the structure built above as YAML in the rendered output. #}
{{ new_exporters_cfg|yaml(False) }}
{%- endif %}
# Rules contributed to the Prometheus server by this template.
server:
# Recording rules; presumably each key below becomes the name of the recorded series (confirm against the consuming Prometheus formula).
recording:
# Number of openstack_nova_instance_status series, grouped by every label except id and name.
total:openstack_nova_instance_status:
query: >-
count(openstack_nova_instance_status) without (id,name)
# Same count, restricted to series whose value equals 2 (the rule name calls this state "error").
total:openstack_nova_instance_status:error:
query: >-
count(openstack_nova_instance_status == 2) without (id,name)
# Copies hostname into a host label, keeps only series whose host has no
# openstack_nova_aggregate_metadata series, counts them with hostname dropped,
# and sets the name label of the result to _no_host_aggregate.
no_aggregate:openstack_nova_aggregate_metadata:
query: >-
label_replace(count(label_replace(openstack_nova_running_instances,"host","$1","hostname","(.*)") unless on(host) openstack_nova_aggregate_metadata) without(hostname),"name","_no_host_aggregate","",".*")
# Naming pattern used by the libvirt rules in this file:
#   host_user:*    sums by host, project (name, uuid) and user (name, uuid)
#   host_project:* sums by host and project (name, uuid) only
# Virtual CPU count summed per host/project/user and per host/project.
host_user:libvirt_domain_info_virtual_cpus:
query: >-
sum(libvirt_domain_info_virtual_cpus) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_info_virtual_cpus:
query: >-
sum(libvirt_domain_info_virtual_cpus) by (host,project_name,project_uuid)
# Per-series rate of the CPU time counter over a 5 minute window; the two sums below read this recorded series.
libvirt_domain_info_cpu_time_seconds:rate5m:
query: >-
rate(libvirt_domain_info_cpu_time_seconds_total[5m])
host_user:libvirt_domain_info_cpu_time_seconds:rate5m:
query: >-
sum(libvirt_domain_info_cpu_time_seconds:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_info_cpu_time_seconds:rate5m:
query: >-
sum(libvirt_domain_info_cpu_time_seconds:rate5m) by (host,project_name,project_uuid)
# Memory metrics: each source metric is summed directly (no rate) twice,
# once per host/project/user and once per host/project.
# Maximum memory.
host_user:libvirt_domain_info_maximum_memory_bytes:
query: >-
sum(libvirt_domain_info_maximum_memory_bytes) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_info_maximum_memory_bytes:
query: >-
sum(libvirt_domain_info_maximum_memory_bytes) by (host,project_name,project_uuid)
# Memory usage.
host_user:libvirt_domain_info_memory_usage_bytes:
query: >-
sum(libvirt_domain_info_memory_usage_bytes) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_info_memory_usage_bytes:
query: >-
sum(libvirt_domain_info_memory_usage_bytes) by (host,project_name,project_uuid)
# Unused memory.
host_user:libvirt_domain_memory_unused_bytes:
query: >-
sum(libvirt_domain_memory_unused_bytes) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_memory_unused_bytes:
query: >-
sum(libvirt_domain_memory_unused_bytes) by (host,project_name,project_uuid)
# Available memory.
host_user:libvirt_domain_memory_available_bytes:
query: >-
sum(libvirt_domain_memory_available_bytes) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_memory_available_bytes:
query: >-
sum(libvirt_domain_memory_available_bytes) by (host,project_name,project_uuid)
# Actual balloon size.
host_user:libvirt_domain_memory_actual_balloon_bytes:
query: >-
sum(libvirt_domain_memory_actual_balloon_bytes) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_memory_actual_balloon_bytes:
query: >-
sum(libvirt_domain_memory_actual_balloon_bytes) by (host,project_name,project_uuid)
# RSS memory.
host_user:libvirt_domain_memory_rss_bytes:
query: >-
sum(libvirt_domain_memory_rss_bytes) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_memory_rss_bytes:
query: >-
sum(libvirt_domain_memory_rss_bytes) by (host,project_name,project_uuid)
# Usable memory.
host_user:libvirt_domain_memory_usable_bytes:
query: >-
sum(libvirt_domain_memory_usable_bytes) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_memory_usable_bytes:
query: >-
sum(libvirt_domain_memory_usable_bytes) by (host,project_name,project_uuid)
# Block device counters. Each counter gets four rules:
#   <metric>:rate5m               5 minute rate of the *_total counter
#   instance:<metric>:rate5m      that rate summed with source_file and target_device dropped
#   host_user:<metric>:rate5m     that rate summed by host, project, type and user
#   host_project:<metric>:rate5m  that rate summed by host, project and type
# Bytes read.
libvirt_domain_block_stats_read_bytes:rate5m:
query: >-
rate(libvirt_domain_block_stats_read_bytes_total[5m])
instance:libvirt_domain_block_stats_read_bytes:rate5m:
query: >-
sum(libvirt_domain_block_stats_read_bytes:rate5m) without (source_file,target_device)
host_user:libvirt_domain_block_stats_read_bytes:rate5m:
query: >-
sum(libvirt_domain_block_stats_read_bytes:rate5m) by (host,project_name,project_uuid,type,user_name,user_uuid)
host_project:libvirt_domain_block_stats_read_bytes:rate5m:
query: >-
sum(libvirt_domain_block_stats_read_bytes:rate5m) by (host,project_name,project_uuid,type)
# Read requests.
libvirt_domain_block_stats_read_requests:rate5m:
query: >-
rate(libvirt_domain_block_stats_read_requests_total[5m])
instance:libvirt_domain_block_stats_read_requests:rate5m:
query: >-
sum(libvirt_domain_block_stats_read_requests:rate5m) without (source_file,target_device)
host_user:libvirt_domain_block_stats_read_requests:rate5m:
query: >-
sum(libvirt_domain_block_stats_read_requests:rate5m) by (host,project_name,project_uuid,type,user_name,user_uuid)
host_project:libvirt_domain_block_stats_read_requests:rate5m:
query: >-
sum(libvirt_domain_block_stats_read_requests:rate5m) by (host,project_name,project_uuid,type)
# Bytes written.
libvirt_domain_block_stats_write_bytes:rate5m:
query: >-
rate(libvirt_domain_block_stats_write_bytes_total[5m])
instance:libvirt_domain_block_stats_write_bytes:rate5m:
query: >-
sum(libvirt_domain_block_stats_write_bytes:rate5m) without (source_file,target_device)
host_user:libvirt_domain_block_stats_write_bytes:rate5m:
query: >-
sum(libvirt_domain_block_stats_write_bytes:rate5m) by (host,project_name,project_uuid,type,user_name,user_uuid)
host_project:libvirt_domain_block_stats_write_bytes:rate5m:
query: >-
sum(libvirt_domain_block_stats_write_bytes:rate5m) by (host,project_name,project_uuid,type)
# Write requests.
libvirt_domain_block_stats_write_requests:rate5m:
query: >-
rate(libvirt_domain_block_stats_write_requests_total[5m])
instance:libvirt_domain_block_stats_write_requests:rate5m:
query: >-
sum(libvirt_domain_block_stats_write_requests:rate5m) without (source_file,target_device)
host_user:libvirt_domain_block_stats_write_requests:rate5m:
query: >-
sum(libvirt_domain_block_stats_write_requests:rate5m) by (host,project_name,project_uuid,type,user_name,user_uuid)
host_project:libvirt_domain_block_stats_write_requests:rate5m:
query: >-
sum(libvirt_domain_block_stats_write_requests:rate5m) by (host,project_name,project_uuid,type)
# Block device size metrics, summed directly (no rate). Each metric gets three rules:
#   instance:*      sum with source_file and target_device dropped
#   host_user:*     sum by host, project, type and user
#   host_project:*  sum by host, project and type
# Allocation.
instance:libvirt_domain_block_stats_allocation:
query: >-
sum(libvirt_domain_block_stats_allocation) without (source_file,target_device)
host_user:libvirt_domain_block_stats_allocation:
query: >-
sum(libvirt_domain_block_stats_allocation) by (host,project_name,project_uuid,type,user_name,user_uuid)
host_project:libvirt_domain_block_stats_allocation:
query: >-
sum(libvirt_domain_block_stats_allocation) by (host,project_name,project_uuid,type)
# Capacity.
instance:libvirt_domain_block_stats_capacity:
query: >-
sum(libvirt_domain_block_stats_capacity) without (source_file,target_device)
host_user:libvirt_domain_block_stats_capacity:
query: >-
sum(libvirt_domain_block_stats_capacity) by (host,project_name,project_uuid,type,user_name,user_uuid)
host_project:libvirt_domain_block_stats_capacity:
query: >-
sum(libvirt_domain_block_stats_capacity) by (host,project_name,project_uuid,type)
# Physical size.
instance:libvirt_domain_block_stats_physical:
query: >-
sum(libvirt_domain_block_stats_physical) without (source_file,target_device)
host_user:libvirt_domain_block_stats_physical:
query: >-
sum(libvirt_domain_block_stats_physical) by (host,project_name,project_uuid,type,user_name,user_uuid)
host_project:libvirt_domain_block_stats_physical:
query: >-
sum(libvirt_domain_block_stats_physical) by (host,project_name,project_uuid,type)
# Network interface counters. Each counter gets four rules:
#   <metric>:rate5m               5 minute rate of the *_total counter
#   instance:<metric>:rate5m      that rate summed with source_bridge and target_device dropped
#   host_user:<metric>:rate5m     that rate summed by host, project and user
#   host_project:<metric>:rate5m  that rate summed by host and project
# Received packets dropped.
libvirt_domain_interface_stats_receive_drops:rate5m:
query: >-
rate(libvirt_domain_interface_stats_receive_drops_total[5m])
instance:libvirt_domain_interface_stats_receive_drops:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_drops:rate5m) without (source_bridge,target_device)
host_user:libvirt_domain_interface_stats_receive_drops:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_drops:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_interface_stats_receive_drops:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_drops:rate5m) by (host,project_name,project_uuid)
# Receive errors.
libvirt_domain_interface_stats_receive_errors:rate5m:
query: >-
rate(libvirt_domain_interface_stats_receive_errors_total[5m])
instance:libvirt_domain_interface_stats_receive_errors:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_errors:rate5m) without (source_bridge,target_device)
host_user:libvirt_domain_interface_stats_receive_errors:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_errors:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_interface_stats_receive_errors:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_errors:rate5m) by (host,project_name,project_uuid)
# Received packets.
libvirt_domain_interface_stats_receive_packets:rate5m:
query: >-
rate(libvirt_domain_interface_stats_receive_packets_total[5m])
instance:libvirt_domain_interface_stats_receive_packets:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_packets:rate5m) without (source_bridge,target_device)
host_user:libvirt_domain_interface_stats_receive_packets:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_packets:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_interface_stats_receive_packets:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_packets:rate5m) by (host,project_name,project_uuid)
# Transmitted bytes.
libvirt_domain_interface_stats_transmit_bytes:rate5m:
query: >-
rate(libvirt_domain_interface_stats_transmit_bytes_total[5m])
instance:libvirt_domain_interface_stats_transmit_bytes:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_bytes:rate5m) without (source_bridge,target_device)
host_user:libvirt_domain_interface_stats_transmit_bytes:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_bytes:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_interface_stats_transmit_bytes:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_bytes:rate5m) by (host,project_name,project_uuid)
# Transmitted packets dropped.
libvirt_domain_interface_stats_transmit_drops:rate5m:
query: >-
rate(libvirt_domain_interface_stats_transmit_drops_total[5m])
instance:libvirt_domain_interface_stats_transmit_drops:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_drops:rate5m) without (source_bridge,target_device)
host_user:libvirt_domain_interface_stats_transmit_drops:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_drops:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_interface_stats_transmit_drops:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_drops:rate5m) by (host,project_name,project_uuid)
# Transmit errors.
libvirt_domain_interface_stats_transmit_errors:rate5m:
query: >-
rate(libvirt_domain_interface_stats_transmit_errors_total[5m])
instance:libvirt_domain_interface_stats_transmit_errors:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_errors:rate5m) without (source_bridge,target_device)
host_user:libvirt_domain_interface_stats_transmit_errors:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_errors:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_interface_stats_transmit_errors:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_errors:rate5m) by (host,project_name,project_uuid)
# Transmitted packets.
libvirt_domain_interface_stats_transmit_packets:rate5m:
query: >-
rate(libvirt_domain_interface_stats_transmit_packets_total[5m])
instance:libvirt_domain_interface_stats_transmit_packets:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_packets:rate5m) without (source_bridge,target_device)
host_user:libvirt_domain_interface_stats_transmit_packets:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_packets:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_interface_stats_transmit_packets:rate5m:
query: >-
sum(libvirt_domain_interface_stats_transmit_packets:rate5m) by (host,project_name,project_uuid)
# Received bytes.
libvirt_domain_interface_stats_receive_bytes:rate5m:
query: >-
rate(libvirt_domain_interface_stats_receive_bytes_total[5m])
instance:libvirt_domain_interface_stats_receive_bytes:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_bytes:rate5m) without (source_bridge,target_device)
host_user:libvirt_domain_interface_stats_receive_bytes:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_bytes:rate5m) by (host,project_name,project_uuid,user_name,user_uuid)
host_project:libvirt_domain_interface_stats_receive_bytes:rate5m:
query: >-
sum(libvirt_domain_interface_stats_receive_bytes:rate5m) by (host,project_name,project_uuid)
# Alerting rules; each key below is an alert name.
alert:
{#- The alerts up to the matching endif are rendered only when the controller role is enabled. #}
{%- if is_controller %}
{% raw %}
# The raw section keeps Jinja from interpreting the double-brace label placeholders used in the annotations.
# Critical: the highest openstack_api_check_status value among checks named nova.* or placement equals 0.
NovaApiOutage:
if: >-
max(openstack_api_check_status{name=~"nova.*|placement"}) == 0
labels:
severity: critical
service: nova
annotations:
summary: "Nova API outage"
description: >-
Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog.
# Major: any single nova.* or placement check reports 0; the name label identifies the endpoint.
NovaApiDown:
if: >-
openstack_api_check_status{name=~"nova.*|placement"} == 0
labels:
severity: major
service: nova
annotations:
summary: "{{ $labels.name }} endpoint is not accessible"
description: >-
Nova API is not accessible for the {{ $labels.name }} endpoint.
# The three alerts below read http_response_status for name nova-api and all carry "for: 2m".
# Minor: an individual nova-api endpoint reports status 0.
NovaApiEndpointDown:
if: >-
http_response_status{name=~"nova-api"} == 0
for: 2m
labels:
severity: minor
service: nova
annotations:
summary: "nova-api endpoint is not accessible"
description: >-
The nova-api endpoint on the {{ $labels.host }} node is not accessible for 2 minutes.
# Major: the number of endpoints reporting 0 is at least 0.6 times the total number of endpoints.
NovaApiEndpointsDownMajor:
if: >-
count(http_response_status{name=~"nova-api"} == 0) >= count(http_response_status{name=~"nova-api"}) * 0.6
for: 2m
labels:
severity: major
service: nova
annotations:
summary: "60% of nova-api endpoints are not accessible"
description: >-
More than 60% of nova-api endpoints are not accessible for 2 minutes.
# Critical: the number of endpoints reporting 0 equals the total number of endpoints.
NovaApiEndpointsOutage:
if: >-
count(http_response_status{name=~"nova-api"} == 0) == count(http_response_status{name=~"nova-api"})
for: 2m
labels:
severity: critical
service: nova
annotations:
summary: "nova-api endpoints outage"
description: >-
All available nova-api endpoints are not accessible for 2 minutes.
# Minor: any single openstack_nova_service_state series equals 0; the binary
# and hostname labels of that series are used in the annotations.
NovaServiceDown:
if: >-
openstack_nova_service_state == 0
labels:
severity: minor
service: nova
annotations:
summary: "{{ $labels.binary }} service is down"
description: >-
The {{ $labels.binary }} service on the {{ $labels.hostname }} node is down.
# Minor: for each Nova binary other than nova-compute, fires when the number
# of services reporting state 0 is at least 0.3 times the number of services
# of that binary. The two counts are matched on the binary label.
NovaServicesDownMinor:
  if: >-
    count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * 0.3
  labels:
    severity: minor
    service: nova
  annotations:
    summary: "30% of {{ $labels.binary }} services are down"
    # Wording aligned with the other percentage alerts ("More than N% of ...").
    description: >-
      More than 30% of {{ $labels.binary }} services are down.
# Minor: the number of nova-compute services reporting state 0 is at least 0.25 times the number of nova-compute services.
NovaComputeServicesDownMinor:
if: >-
count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * 0.25
labels:
severity: minor
service: nova
annotations:
summary: "More than 25% of nova-compute services are down"
description: >-
More than 25% of nova-compute services are down.
# Major: per binary other than nova-compute, the number of services reporting
# state 0 is at least 0.6 times the number of services of that binary.
NovaServicesDownMajor:
if: >-
count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * 0.6
labels:
severity: major
service: nova
annotations:
summary: "More than 60% of {{ $labels.binary }} services are down"
description: >-
More than 60% of {{ $labels.binary }} services are down.
# Major: the number of nova-compute services reporting state 0 is at least 0.5 times the number of nova-compute services.
NovaComputeServicesDownMajor:
if: >-
count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * 0.5
labels:
severity: major
service: nova
annotations:
summary: "More than 50% of nova-compute services are down"
description: >-
More than 50% of nova-compute services are down.
# Critical: per binary (nova-compute included), the number of services
# reporting state 0 equals the total number of services of that binary.
NovaServiceOutage:
if: >-
count(openstack_nova_service_state == 0) by (binary) == on (binary) count(openstack_nova_service_state) by (binary)
labels:
severity: critical
service: nova
annotations:
summary: "{{ $labels.binary }} service outage"
description: >-
All {{ $labels.binary }} services are down.
{%- endraw %}
{#- End of the alerts that are rendered only for the controller role. #}
{%- endif %}
{#- This alert sits outside the is_controller condition, so it is rendered for both controller and compute roles. #}
{%- raw %}
# Warning: the 5 minute rate of Nova log messages at level error, emergency or
# fatal (matched case-insensitively), summed with the level label dropped,
# is above 0.2 per second.
NovaErrorLogsTooHigh:
if: >-
sum(rate(log_messages{service="nova",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2
labels:
severity: warning
service: nova
annotations:
summary: "High number of errors in Nova logs"
description: "The average rate of errors in Nova logs on the {{ $labels.host }} node is more than 0.2 error messages per second (as measured over the last 5 minutes)."
{%- endraw %}
{#- Libvirt exporter alert: uses the same condition as the exporter configuration block near the top of the file. #}
{%- if is_compute and exporters is defined and compute.get('compute_driver', 'libvirt.LibvirtDriver') == 'libvirt.LibvirtDriver'%}
{%- raw %}
# Critical: libvirt_up equals 0, with "for: 2m".
LibvirtDown:
if: >-
libvirt_up == 0
for: 2m
labels:
severity: critical
service: libvirt
annotations:
summary: "Failure to gather Libvirt metrics"
description: "The Libvirt metric exporter fails to gather metrics on the {{ $labels.host }} node for 2 minutes."
{%- endraw %}
{#- NOTE(review): the included template presumably consumes new_exporters_cfg built earlier in this file; confirm against prometheus/_exporters_config.sls. #}
{%- include "prometheus/_exporters_config.sls" %}
{%- endif %}
{#- Closes the outer "is_controller or is_compute" condition opened at the top of the file. #}
{%- endif %}