| {% from "nova/map.jinja" import controller, compute with context %} |
| |
# Role flags read from the imported nova map; a missing 'enabled' key counts
# as disabled.
| {%- set is_controller = controller.get('enabled', False) %} |
| {%- set is_compute = compute.get('enabled', False) %} |
| |
# The rest of the template is rendered only for Nova controller or compute
# nodes.
| {%- if is_controller or is_compute %} |
# Libvirt exporter settings: built only on compute nodes where an 'exporters'
# variable is in scope and the compute driver is libvirt.LibvirtDriver (which
# is also the fallback when compute_driver is not set).
| {%- if is_compute and exporters is defined and compute.get('compute_driver', 'libvirt.LibvirtDriver') == 'libvirt.LibvirtDriver' %} |
# Package list comes from exporters.libvirt.packages and falls back to the
# single package libvirt-exporter.
| {%- set packages = exporters.get('libvirt', {}).get('packages', ('libvirt-exporter', )) %} |
| {%- load_yaml as new_exporters_cfg %} |
| exporters: |
| libvirt: |
| enabled: true |
# NOTE(review): packages is assigned unconditionally a few lines above, so
# this guard looks always true - confirm before simplifying.
| {%- if packages is defined %} |
| packages: |
| {% for pkg in packages %} |
| - {{ pkg }} |
| {% endfor %} |
| {%- endif %} |
# The qemu exporter service listens on 0.0.0.0, port 9177.
| services: |
| qemu: |
| enabled: true |
| bind: |
| address: 0.0.0.0 |
| port: 9177 |
| {%- endload %} |
# Write the structure captured above back out through the yaml filter.
| {{ new_exporters_cfg|yaml(False) }} |
| {%- endif %} |
| |
# Prometheus server settings: recording rules first, alert rules further down.
| server: |
| recording: |
# Instance counts with the id and name labels aggregated away: all instance
# status series, then only those whose status value equals 2 (stored under
# the :error name).
| total:openstack_nova_instance_status: |
| query: >- |
| count(openstack_nova_instance_status) without (id,name) |
| total:openstack_nova_instance_status:error: |
| query: >- |
| count(openstack_nova_instance_status == 2) without (id,name) |
| |
# Copies hostname into a host label, drops series whose host appears in
# openstack_nova_aggregate_metadata, counts what is left without the hostname
# label, and tags the result with name="_no_host_aggregate".
| no_aggregate:openstack_nova_aggregate_metadata: |
| query: >- |
| label_replace(count(label_replace(openstack_nova_running_instances,"host","$1","hostname","(.*)") unless on(host) openstack_nova_aggregate_metadata) without(hostname),"name","_no_host_aggregate","",".*") |
| |
# Libvirt per-domain CPU metrics. host_user:* sums keep the host, project and
# user labels; host_project:* sums keep the host and project labels only.
| host_user:libvirt_domain_info_virtual_cpus: |
| query: >- |
| sum(libvirt_domain_info_virtual_cpus) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_info_virtual_cpus: |
| query: >- |
| sum(libvirt_domain_info_virtual_cpus) by (host,project_name,project_uuid) |
# Per-second rate of the CPU time counter over a 5-minute window; the two sums
# that follow read this recorded series rather than the raw counter.
| libvirt_domain_info_cpu_time_seconds:rate5m: |
| query: >- |
| rate(libvirt_domain_info_cpu_time_seconds_total[5m]) |
| host_user:libvirt_domain_info_cpu_time_seconds:rate5m: |
| query: >- |
| sum(libvirt_domain_info_cpu_time_seconds:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_info_cpu_time_seconds:rate5m: |
| query: >- |
| sum(libvirt_domain_info_cpu_time_seconds:rate5m) by (host,project_name,project_uuid) |
| |
# Libvirt per-domain memory gauges. Each metric is summed twice: once per
# host/project/user (host_user:*) and once per host/project (host_project:*).
# No rate is applied to any of them.
| host_user:libvirt_domain_info_maximum_memory_bytes: |
| query: >- |
| sum(libvirt_domain_info_maximum_memory_bytes) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_info_maximum_memory_bytes: |
| query: >- |
| sum(libvirt_domain_info_maximum_memory_bytes) by (host,project_name,project_uuid) |
| host_user:libvirt_domain_info_memory_usage_bytes: |
| query: >- |
| sum(libvirt_domain_info_memory_usage_bytes) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_info_memory_usage_bytes: |
| query: >- |
| sum(libvirt_domain_info_memory_usage_bytes) by (host,project_name,project_uuid) |
| host_user:libvirt_domain_memory_unused_bytes: |
| query: >- |
| sum(libvirt_domain_memory_unused_bytes) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_memory_unused_bytes: |
| query: >- |
| sum(libvirt_domain_memory_unused_bytes) by (host,project_name,project_uuid) |
| host_user:libvirt_domain_memory_available_bytes: |
| query: >- |
| sum(libvirt_domain_memory_available_bytes) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_memory_available_bytes: |
| query: >- |
| sum(libvirt_domain_memory_available_bytes) by (host,project_name,project_uuid) |
| host_user:libvirt_domain_memory_actual_balloon_bytes: |
| query: >- |
| sum(libvirt_domain_memory_actual_balloon_bytes) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_memory_actual_balloon_bytes: |
| query: >- |
| sum(libvirt_domain_memory_actual_balloon_bytes) by (host,project_name,project_uuid) |
| host_user:libvirt_domain_memory_rss_bytes: |
| query: >- |
| sum(libvirt_domain_memory_rss_bytes) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_memory_rss_bytes: |
| query: >- |
| sum(libvirt_domain_memory_rss_bytes) by (host,project_name,project_uuid) |
| host_user:libvirt_domain_memory_usable_bytes: |
| query: >- |
| sum(libvirt_domain_memory_usable_bytes) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_memory_usable_bytes: |
| query: >- |
| sum(libvirt_domain_memory_usable_bytes) by (host,project_name,project_uuid) |
| |
# Libvirt block-device metrics. For each *_total counter a 5-minute rate is
# recorded first, then three sums are derived from that recorded rate:
#   instance:*     - drops the source_file and target_device labels
#   host_user:*    - keeps host, project, type and user labels
#   host_project:* - keeps host, project and type labels
| libvirt_domain_block_stats_read_bytes:rate5m: |
| query: >- |
| rate(libvirt_domain_block_stats_read_bytes_total[5m]) |
| instance:libvirt_domain_block_stats_read_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_read_bytes:rate5m) without (source_file,target_device) |
| host_user:libvirt_domain_block_stats_read_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_read_bytes:rate5m) by (host,project_name,project_uuid,type,user_name,user_uuid) |
| host_project:libvirt_domain_block_stats_read_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_read_bytes:rate5m) by (host,project_name,project_uuid,type) |
| libvirt_domain_block_stats_read_requests:rate5m: |
| query: >- |
| rate(libvirt_domain_block_stats_read_requests_total[5m]) |
| instance:libvirt_domain_block_stats_read_requests:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_read_requests:rate5m) without (source_file,target_device) |
| host_user:libvirt_domain_block_stats_read_requests:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_read_requests:rate5m) by (host,project_name,project_uuid,type,user_name,user_uuid) |
| host_project:libvirt_domain_block_stats_read_requests:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_read_requests:rate5m) by (host,project_name,project_uuid,type) |
| libvirt_domain_block_stats_write_bytes:rate5m: |
| query: >- |
| rate(libvirt_domain_block_stats_write_bytes_total[5m]) |
| instance:libvirt_domain_block_stats_write_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_write_bytes:rate5m) without (source_file,target_device) |
| host_user:libvirt_domain_block_stats_write_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_write_bytes:rate5m) by (host,project_name,project_uuid,type,user_name,user_uuid) |
| host_project:libvirt_domain_block_stats_write_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_write_bytes:rate5m) by (host,project_name,project_uuid,type) |
| libvirt_domain_block_stats_write_requests:rate5m: |
| query: >- |
| rate(libvirt_domain_block_stats_write_requests_total[5m]) |
| instance:libvirt_domain_block_stats_write_requests:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_write_requests:rate5m) without (source_file,target_device) |
| host_user:libvirt_domain_block_stats_write_requests:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_write_requests:rate5m) by (host,project_name,project_uuid,type,user_name,user_uuid) |
| host_project:libvirt_domain_block_stats_write_requests:rate5m: |
| query: >- |
| sum(libvirt_domain_block_stats_write_requests:rate5m) by (host,project_name,project_uuid,type) |
# allocation, capacity and physical are summed directly from the source
# series, with the same three groupings and no rate step.
| instance:libvirt_domain_block_stats_allocation: |
| query: >- |
| sum(libvirt_domain_block_stats_allocation) without (source_file,target_device) |
| host_user:libvirt_domain_block_stats_allocation: |
| query: >- |
| sum(libvirt_domain_block_stats_allocation) by (host,project_name,project_uuid,type,user_name,user_uuid) |
| host_project:libvirt_domain_block_stats_allocation: |
| query: >- |
| sum(libvirt_domain_block_stats_allocation) by (host,project_name,project_uuid,type) |
| instance:libvirt_domain_block_stats_capacity: |
| query: >- |
| sum(libvirt_domain_block_stats_capacity) without (source_file,target_device) |
| host_user:libvirt_domain_block_stats_capacity: |
| query: >- |
| sum(libvirt_domain_block_stats_capacity) by (host,project_name,project_uuid,type,user_name,user_uuid) |
| host_project:libvirt_domain_block_stats_capacity: |
| query: >- |
| sum(libvirt_domain_block_stats_capacity) by (host,project_name,project_uuid,type) |
| instance:libvirt_domain_block_stats_physical: |
| query: >- |
| sum(libvirt_domain_block_stats_physical) without (source_file,target_device) |
| host_user:libvirt_domain_block_stats_physical: |
| query: >- |
| sum(libvirt_domain_block_stats_physical) by (host,project_name,project_uuid,type,user_name,user_uuid) |
| host_project:libvirt_domain_block_stats_physical: |
| query: >- |
| sum(libvirt_domain_block_stats_physical) by (host,project_name,project_uuid,type) |
| |
# Libvirt network-interface metrics. Every *_total counter gets a 5-minute
# rate, followed by three sums over that recorded rate:
#   instance:*     - drops the source_bridge and target_device labels
#   host_user:*    - keeps host, project and user labels
#   host_project:* - keeps host and project labels
| libvirt_domain_interface_stats_receive_drops:rate5m: |
| query: >- |
| rate(libvirt_domain_interface_stats_receive_drops_total[5m]) |
| instance:libvirt_domain_interface_stats_receive_drops:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_drops:rate5m) without (source_bridge,target_device) |
| host_user:libvirt_domain_interface_stats_receive_drops:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_drops:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_interface_stats_receive_drops:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_drops:rate5m) by (host,project_name,project_uuid) |
| libvirt_domain_interface_stats_receive_errors:rate5m: |
| query: >- |
| rate(libvirt_domain_interface_stats_receive_errors_total[5m]) |
| instance:libvirt_domain_interface_stats_receive_errors:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_errors:rate5m) without (source_bridge,target_device) |
| host_user:libvirt_domain_interface_stats_receive_errors:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_errors:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_interface_stats_receive_errors:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_errors:rate5m) by (host,project_name,project_uuid) |
| libvirt_domain_interface_stats_receive_packets:rate5m: |
| query: >- |
| rate(libvirt_domain_interface_stats_receive_packets_total[5m]) |
| instance:libvirt_domain_interface_stats_receive_packets:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_packets:rate5m) without (source_bridge,target_device) |
| host_user:libvirt_domain_interface_stats_receive_packets:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_packets:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_interface_stats_receive_packets:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_packets:rate5m) by (host,project_name,project_uuid) |
| libvirt_domain_interface_stats_transmit_bytes:rate5m: |
| query: >- |
| rate(libvirt_domain_interface_stats_transmit_bytes_total[5m]) |
| instance:libvirt_domain_interface_stats_transmit_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_bytes:rate5m) without (source_bridge,target_device) |
| host_user:libvirt_domain_interface_stats_transmit_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_bytes:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_interface_stats_transmit_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_bytes:rate5m) by (host,project_name,project_uuid) |
| libvirt_domain_interface_stats_transmit_drops:rate5m: |
| query: >- |
| rate(libvirt_domain_interface_stats_transmit_drops_total[5m]) |
| instance:libvirt_domain_interface_stats_transmit_drops:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_drops:rate5m) without (source_bridge,target_device) |
| host_user:libvirt_domain_interface_stats_transmit_drops:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_drops:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_interface_stats_transmit_drops:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_drops:rate5m) by (host,project_name,project_uuid) |
| libvirt_domain_interface_stats_transmit_errors:rate5m: |
| query: >- |
| rate(libvirt_domain_interface_stats_transmit_errors_total[5m]) |
| instance:libvirt_domain_interface_stats_transmit_errors:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_errors:rate5m) without (source_bridge,target_device) |
| host_user:libvirt_domain_interface_stats_transmit_errors:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_errors:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_interface_stats_transmit_errors:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_errors:rate5m) by (host,project_name,project_uuid) |
| libvirt_domain_interface_stats_transmit_packets:rate5m: |
| query: >- |
| rate(libvirt_domain_interface_stats_transmit_packets_total[5m]) |
| instance:libvirt_domain_interface_stats_transmit_packets:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_packets:rate5m) without (source_bridge,target_device) |
| host_user:libvirt_domain_interface_stats_transmit_packets:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_packets:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_interface_stats_transmit_packets:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_transmit_packets:rate5m) by (host,project_name,project_uuid) |
| libvirt_domain_interface_stats_receive_bytes:rate5m: |
| query: >- |
| rate(libvirt_domain_interface_stats_receive_bytes_total[5m]) |
| instance:libvirt_domain_interface_stats_receive_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_bytes:rate5m) without (source_bridge,target_device) |
| host_user:libvirt_domain_interface_stats_receive_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_bytes:rate5m) by (host,project_name,project_uuid,user_name,user_uuid) |
| host_project:libvirt_domain_interface_stats_receive_bytes:rate5m: |
| query: >- |
| sum(libvirt_domain_interface_stats_receive_bytes:rate5m) by (host,project_name,project_uuid) |
| |
# Alert rules. The first group is rendered only on controller nodes.
| alert: |
| {%- if is_controller %} |
# The raw section keeps Jinja from evaluating the Prometheus label
# placeholders used in the annotations.
| {% raw %} |
# Fires when the highest API check status across all nova* and placement
# checks is 0.
| NovaApiOutage: |
| if: >- |
| max(openstack_api_check_status{name=~"nova.*|placement"}) == 0 |
| labels: |
| severity: critical |
| service: nova |
| annotations: |
| summary: "Nova API outage" |
| description: >- |
| Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog. |
# Per-check variant: fires for each individual nova* or placement check whose
# status is 0.
| NovaApiDown: |
| if: >- |
| openstack_api_check_status{name=~"nova.*|placement"} == 0 |
| labels: |
| severity: major |
| service: nova |
| annotations: |
| summary: "{{ $labels.name }} endpoint is not accessible" |
| description: >- |
| Nova API is not accessible for the {{ $labels.name }} endpoint. |
# Fires when the nova-api HTTP response status has been 0 for 2 minutes.
| NovaApiEndpointDown: |
| if: >- |
| http_response_status{name=~"nova-api"} == 0 |
| for: 2m |
| labels: |
| severity: minor |
| service: nova |
| annotations: |
| summary: "nova-api endpoint is not accessible" |
| description: >- |
| The nova-api endpoint on the {{ $labels.host }} node is not accessible for 2 minutes. |
# Fires when, for 2 minutes, the number of nova-api endpoints with response
# status 0 reaches 60% of all nova-api endpoints (the comparison is >=, so
# exactly 60% also fires). The summary uses the same "More than N% of"
# wording as the description and the other percentage-based alerts.
| NovaApiEndpointsDownMajor: |
| if: >- |
| count(http_response_status{name=~"nova-api"} == 0) >= count(http_response_status{name=~"nova-api"}) * 0.6 |
| for: 2m |
| labels: |
| severity: major |
| service: nova |
| annotations: |
| summary: "More than 60% of nova-api endpoints are not accessible" |
| description: >- |
| More than 60% of nova-api endpoints are not accessible for 2 minutes. |
# Fires when, for 2 minutes, the count of nova-api endpoints with response
# status 0 equals the count of all nova-api endpoints.
| NovaApiEndpointsOutage: |
| if: >- |
| count(http_response_status{name=~"nova-api"} == 0) == count(http_response_status{name=~"nova-api"}) |
| for: 2m |
| labels: |
| severity: critical |
| service: nova |
| annotations: |
| summary: "nova-api endpoints outage" |
| description: >- |
| All available nova-api endpoints are not accessible for 2 minutes. |
# Fires for each individual Nova service whose state is 0; there is no hold
# period on this rule.
| NovaServiceDown: |
| if: >- |
| openstack_nova_service_state == 0 |
| labels: |
| severity: minor |
| service: nova |
| annotations: |
| summary: "{{ $labels.binary }} service is down" |
| description: >- |
| The {{ $labels.binary }} service on the {{ $labels.hostname }} node is down. |
# For every Nova binary other than nova-compute, fires when the number of
# services in state 0 reaches 30% of that binary's services (the comparison
# is >=, so exactly 30% also fires). The annotation wording follows the
# "More than N% of" form used by the other percentage-based alerts.
| NovaServicesDownMinor: |
| if: >- |
| count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * 0.3 |
| labels: |
| severity: minor |
| service: nova |
| annotations: |
| summary: "More than 30% of {{ $labels.binary }} services are down" |
| description: >- |
| More than 30% of {{ $labels.binary }} services are down. |
# Fires when nova-compute services in state 0 reach 25% of all nova-compute
# services (the comparison is >=).
| NovaComputeServicesDownMinor: |
| if: >- |
| count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * 0.25 |
| labels: |
| severity: minor |
| service: nova |
| annotations: |
| summary: "More than 25% of nova-compute services are down" |
| description: >- |
| More than 25% of nova-compute services are down. |
# Per binary, excluding nova-compute: fires when services in state 0 reach 60%
# of that binary's services.
| NovaServicesDownMajor: |
| if: >- |
| count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * 0.6 |
| labels: |
| severity: major |
| service: nova |
| annotations: |
| summary: "More than 60% of {{ $labels.binary }} services are down" |
| description: >- |
| More than 60% of {{ $labels.binary }} services are down. |
# Fires when nova-compute services in state 0 reach 50% of all nova-compute
# services.
| NovaComputeServicesDownMajor: |
| if: >- |
| count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * 0.5 |
| labels: |
| severity: major |
| service: nova |
| annotations: |
| summary: "More than 50% of nova-compute services are down" |
| description: >- |
| More than 50% of nova-compute services are down. |
# Per binary, nova-compute included: fires when the count of services in
# state 0 equals the total count for that binary.
| NovaServiceOutage: |
| if: >- |
| count(openstack_nova_service_state == 0) by (binary) == on (binary) count(openstack_nova_service_state) by (binary) |
| labels: |
| severity: critical |
| service: nova |
| annotations: |
| summary: "{{ $labels.binary }} service outage" |
| description: >- |
| All {{ $labels.binary }} services are down. |
| {%- endraw %} |
| {%- endif %} |
# Rendered for controller and compute nodes alike. Fires when the per-second
# rate of Nova log messages at error, emergency or fatal level (matched
# case-insensitively, summed across levels over a 5-minute window) exceeds 0.2.
| {%- raw %} |
| NovaErrorLogsTooHigh: |
| if: >- |
| sum(rate(log_messages{service="nova",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2 |
| labels: |
| severity: warning |
| service: nova |
| annotations: |
| summary: "High number of errors in Nova logs" |
| description: "The average rate of errors in Nova logs on the {{ $labels.host }} node is more than 0.2 error messages per second (as measured over the last 5 minutes)." |
| {%- endraw %} |
# Libvirt exporter health alert, rendered under the same compute-node and
# libvirt-driver condition that builds the exporter settings earlier in this
# template. The shared exporters config template is included after the alert.
| {%- if is_compute and exporters is defined and compute.get('compute_driver', 'libvirt.LibvirtDriver') == 'libvirt.LibvirtDriver'%} |
| {%- raw %} |
# Fires when libvirt_up has been 0 for 2 minutes.
| LibvirtDown: |
| if: >- |
| libvirt_up == 0 |
| for: 2m |
| labels: |
| severity: critical |
| service: libvirt |
| annotations: |
| summary: "Failure to gather Libvirt metrics" |
| description: "The Libvirt metric exporter fails to gather metrics on the {{ $labels.host }} node for 2 minutes." |
| {%- endraw %} |
| {%- include "prometheus/_exporters_config.sls" %} |
| {%- endif %} |
| {%- endif %} |