| {% from "nova/map.jinja" import controller, compute, monitoring with context %} |
| |
| {#- Role flags: each is the `enabled` value of the corresponding map.jinja dict, defaulting to False when the key is absent. #} |
| {%- set is_controller = controller.get('enabled', False) %} |
| {%- set is_compute = compute.get('enabled', False) %} |
| |
| {#- Everything below is rendered only when at least one of the two roles is enabled. #} |
| {%- if is_controller or is_compute %} |
| {#- Compute role only: build the libvirt exporter definition. `exporters` is not imported in this file; presumably it is supplied by the rendering context (verify against the caller). #} |
| {%- if is_compute and |
| exporters is defined %} |
| {#- Package list for the exporter; falls back to ('libvirt-exporter', ) when no `packages` key is set under exporters.libvirt. #} |
| {%- set packages = exporters.get('libvirt', {}).get('packages', ('libvirt-exporter', )) %} |
| {%- load_yaml as new_exporters_cfg %} |
| exporters: |
| libvirt: |
| enabled: true |
| {#- `packages` is always defined at this point (assigned above), so test its value instead: a null override no longer breaks the loop and an empty override no longer renders `packages:` as null. #} |
| {%- if packages %} |
| packages: |
| {% for pkg in packages %} |
| - {{ pkg }} |
| {% endfor %} |
| {%- endif %} |
| services: |
| qemu: |
| enabled: true |
| bind: |
| address: 0.0.0.0 |
| port: 9177 |
| {%- endload %} |
| {#- Emit the structure built above as block-style YAML. #} |
| {{ new_exporters_cfg|yaml(False) }} |
| {%- endif %} |
| |
| server: |
| alert: |
| {#- Controller-only rules start here. The four thresholds are read from `monitoring` and coerced to float; despite the `_percent` suffix they are used as fractions (multiplied with service counts in the rules, and by 100 only for display). #} |
| {%- if is_controller %} |
| {%- set minor_threshold = monitoring.services_failed_warning_threshold_percent|float %} |
| {%- set major_threshold = monitoring.services_failed_critical_threshold_percent|float %} |
| {%- set minor_compute_threshold = monitoring.computes_failed_warning_threshold_percent|float %} |
| {%- set major_compute_threshold = monitoring.computes_failed_critical_threshold_percent|float %} |
| {#- The raw region opened below keeps Jinja from interpreting the Prometheus template placeholders in the following rules. #} |
| {% raw %} |
| NovaAPIOutage: |
| # Critical: the maximum openstack_api_check_status over all series named nova.* or placement is 0. |
| if: 'max(openstack_api_check_status{name=~"nova.*|placement"}) == 0' |
| labels: {severity: critical, service: nova} |
| annotations: |
| summary: "Nova API outage" |
| description: "Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog." |
| NovaAPIDown: |
| # Major: any single series named nova.* or placement reports openstack_api_check_status 0; one alert per matching series. |
| if: 'openstack_api_check_status{name=~"nova.*|placement"} == 0' |
| labels: {severity: major, service: nova} |
| annotations: |
| summary: "{{ $labels.name }} endpoint is not accessible" |
| description: "Nova API is not accessible for the {{ $labels.name }} endpoint." |
| NovaAPIServiceDown: |
| # Minor: http_response_status for the nova-api check stays at 0 for 2 minutes. |
| if: 'http_response_status{name=~"nova-api"} == 0' |
| for: 2m |
| labels: {severity: minor, service: nova} |
| annotations: |
| summary: "Host nova-api endpoint is not accessible" |
| description: "The host nova-api endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes." |
| NovaServiceDown: |
| # Minor: any individual series of openstack_nova_service_state equals 0. |
| if: 'openstack_nova_service_state == 0' |
| labels: {severity: minor, service: nova} |
| annotations: |
| summary: "{{ $labels.binary }} service is down" |
| description: "The {{ $labels.binary }} service on the {{ $labels.hostname }} node is down." |
| {%- endraw %} |
| NovaServicesDownMinor: |
| {#- Minor: per binary (nova-compute excluded), the number of services in state 0 is at least minor_threshold and below major_threshold times the total for that binary. #} |
| if: >- |
| count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{minor_threshold}} and count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) < on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{major_threshold}} |
| labels: |
| severity: minor |
| service: nova |
| annotations: |
| {#- The percentage is rounded so float multiplication noise (e.g. 0.3 * 100) does not leak into the alert text. #} |
| summary: "{{ (minor_threshold * 100)|round(2) }}%{%- raw %} of {{ $labels.binary }} services are down" |
| description: >- |
| {#- The separating space sits after the endraw tag because its leading dash strips whitespace in front of it. #} |
| {{ $value }} {{ $labels.binary }} services are down{%- endraw %} (at least {{ (minor_threshold * 100)|round(2) }}%). |
| NovaComputeServicesDownMinor: |
| {#- Minor: the number of nova-compute services in state 0 is at least minor_compute_threshold and below major_compute_threshold times the total number of nova-compute services. #} |
| if: >- |
| count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{minor_compute_threshold}} and count(openstack_nova_service_state{binary="nova-compute"} == 0) < count(openstack_nova_service_state{binary="nova-compute"}) * {{major_compute_threshold}} |
| labels: |
| severity: minor |
| service: nova |
| annotations: |
| {#- The percentage is rounded so float multiplication noise (e.g. 0.3 * 100) does not leak into the alert text. #} |
| summary: "{{ (minor_compute_threshold * 100)|round(2) }}%{%- raw %} of nova-compute services are down" |
| description: >- |
| {#- The separating space sits after the endraw tag because its leading dash strips whitespace in front of it. #} |
| {{ $value }} nova-compute services are down{%- endraw %} (at least {{ (minor_compute_threshold * 100)|round(2) }}%). |
| NovaServicesDownMajor: |
| {#- Major: per binary (nova-compute excluded), the number of services in state 0 is at least major_threshold times the total for that binary. #} |
| if: >- |
| count(openstack_nova_service_state{binary!~"nova-compute"} == 0) by (binary) >= on (binary) count(openstack_nova_service_state{binary!~"nova-compute"}) by (binary) * {{major_threshold}} |
| labels: |
| severity: major |
| service: nova |
| annotations: |
| {#- The percentage is rounded so float multiplication noise does not leak into the alert text. #} |
| summary: "{{ (major_threshold * 100)|round(2) }}%{%- raw %} of {{ $labels.binary }} services are down" |
| description: >- |
| {#- The separating space sits after the endraw tag because its leading dash strips whitespace in front of it. #} |
| {{ $value }} {{ $labels.binary }} services are down{%- endraw %} (at least {{ (major_threshold * 100)|round(2) }}%). |
| NovaComputeServicesDownMajor: |
| {#- Major: the number of nova-compute services in state 0 is at least major_compute_threshold times the total number of nova-compute services. #} |
| if: >- |
| count(openstack_nova_service_state{binary="nova-compute"} == 0) >= count(openstack_nova_service_state{binary="nova-compute"}) * {{major_compute_threshold}} |
| labels: |
| severity: major |
| service: nova |
| annotations: |
| {#- The percentage is rounded so float multiplication noise does not leak into the alert text. #} |
| summary: "{{ (major_compute_threshold * 100)|round(2) }}%{%- raw %} of nova-compute services are down" |
| description: >- |
| {#- The separating space sits after the endraw tag because its leading dash strips whitespace in front of it. The raw tag at the end of the next line opens the raw region used by the rules that follow. #} |
| {{ $value }} nova-compute services are down{%- endraw %} (at least {{ (major_compute_threshold * 100)|round(2) }}%).{%- raw %} |
| NovaServiceOutage: |
| # Critical: per binary, the count of services in state 0 equals the total count of services for that binary. |
| if: 'count(openstack_nova_service_state == 0) by (binary) == on (binary) count(openstack_nova_service_state) by (binary)' |
| labels: {severity: critical, service: nova} |
| annotations: |
| summary: "{{ $labels.binary }} service outage" |
| description: "All {{ $labels.binary }} services are down." |
| NovaTotalFreeVCPUsLow: |
| # Warning: free VCPUs are below 10% of (free + used) VCPUs for 1 minute. |
| if: >- |
| (100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0 |
| for: 1m |
| labels: |
| severity: warning |
| service: nova |
| annotations: |
| summary: "VCPU low limit for new instances" |
| description: >- |
| VCPU low limit for 1 minute |
| NovaTotalFreeMemoryLow: |
| # Warning: free RAM is below 10% of (free + used) RAM for 1 minute. |
| if: >- |
| (100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 10.0 |
| for: 1m |
| labels: |
| severity: warning |
| service: nova |
| annotations: |
| summary: "Memory low limit for new instances" |
| description: >- |
| Memory low limit for 1 minute |
| NovaTotalFreeVCPUsShortage: |
| # Critical: free VCPUs are below 2% of (free + used) VCPUs for 1 minute. |
| if: >- |
| (100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 2.0 |
| for: 1m |
| labels: |
| severity: critical |
| service: nova |
| annotations: |
| summary: "VCPU shortage for new instances" |
| description: >- |
| VCPU shortage for 1 minute |
| NovaTotalFreeMemoryShortage: |
| # Critical: free RAM is below 2% of (free + used) RAM for 1 minute. |
| if: >- |
| (100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 2.0 |
| for: 1m |
| labels: |
| severity: critical |
| service: nova |
| annotations: |
| summary: "Memory shortage for new instances" |
| description: >- |
| Memory shortage for 1 minute |
| NovaAggregatesFreeVCPUsLow: |
| # Warning: within an aggregate, free VCPUs are below 10% of (free + used) VCPUs for 1 minute. |
| if: >- |
| (100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 10.0 |
| for: 1m |
| labels: |
| severity: warning |
| service: nova |
| aggregate: "{{ $labels.aggregate }}" |
| annotations: |
| summary: "VCPU low limit for new instances on aggregate {{ $labels.aggregate }}" |
| description: >- |
| VCPU low limit for 1 minute on aggregate {{ $labels.aggregate }} |
| NovaAggregatesFreeMemoryLow: |
| # Warning: within an aggregate, free RAM is below 10% of (free + used) RAM for 1 minute. |
| if: >- |
| (100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 10.0 |
| for: 1m |
| labels: |
| severity: warning |
| service: nova |
| aggregate: "{{ $labels.aggregate }}" |
| annotations: |
| summary: "Memory low limit for new instances on aggregate {{ $labels.aggregate }}" |
| description: >- |
| Memory low limit for 1 minute on aggregate {{ $labels.aggregate }} |
| NovaAggregatesFreeVCPUsShortage: |
| # Critical: within an aggregate, free VCPUs are below 2% of (free + used) VCPUs for 1 minute. |
| if: >- |
| (100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 2.0 |
| for: 1m |
| labels: |
| severity: critical |
| service: nova |
| aggregate: "{{ $labels.aggregate }}" |
| annotations: |
| summary: "VCPU shortage for new instances on aggregate {{ $labels.aggregate }}" |
| description: >- |
| VCPU shortage for 1 minute on aggregate {{ $labels.aggregate }} |
| NovaAggregatesFreeMemoryShortage: |
| # Critical: within an aggregate, free RAM is below 2% of (free + used) RAM for 1 minute. |
| if: >- |
| (100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 2.0 |
| for: 1m |
| labels: |
| severity: critical |
| service: nova |
| aggregate: "{{ $labels.aggregate }}" |
| annotations: |
| summary: "Memory shortage for new instances on aggregate {{ $labels.aggregate }}" |
| description: >- |
| Memory shortage for 1 minute on aggregate {{ $labels.aggregate }} |
| {%- endraw %} |
| {%- endif %} |
| NovaErrorLogsTooHigh: |
| {#- Rendered for both roles (it sits outside the controller-only section). Warning when the 5-minute rate of nova log messages at error/emergency/fatal level (matched case-insensitively), summed across levels, exceeds monitoring.error_log_rate.warn. #} |
| {%- set log_threshold = monitoring.error_log_rate.warn|float %} |
| if: >- |
| sum(rate(log_messages{service="nova",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > {{ log_threshold }} |
| {%- raw %} |
| labels: |
| severity: warning |
| service: nova |
| annotations: |
| summary: "High number of errors in Nova logs" |
| description: "The rate of errors in Nova logs over the last 5 minutes is too high on the {{ $labels.host }} node (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }})." |
| {#- Compute role only, under the same condition that guards the libvirt exporter config near the top of this file. #} |
| {%- if is_compute and exporters is defined %} |
| {%- raw %} |
| LibvirtDown: |
| # Critical: libvirt_up stays at 0 for 2 minutes. |
| if: >- |
| libvirt_up == 0 |
| for: 2m |
| labels: |
| severity: critical |
| service: libvirt |
| annotations: |
| summary: "Failure to gather Libvirt metrics" |
| description: "The Libvirt metric exporter fails to gather metrics on the {{ $labels.host }} node for at least 2 minutes." |
| {%- endraw %} |
| {#- NOTE(review): the included template is not visible here; presumably it renders the exporter configuration built earlier in this file - confirm against the prometheus formula. #} |
| {%- include "prometheus/_exporters_config.sls" %} |
| {%- endif %} |
| {%- endif %} |