Add Prometheus alerts
Change-Id: I7e3616c49eb0e20be37ba0fbcf396f80aada672e
diff --git a/nova/meta/prometheus.yml b/nova/meta/prometheus.yml
index d38dd88..0f2ce23 100644
--- a/nova/meta/prometheus.yml
+++ b/nova/meta/prometheus.yml
@@ -20,6 +20,156 @@
summary: "Endpoint check for '{{ $labels.service }}' is down"
description: >-
Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+ NovaSomeServicesDown:
+ if: >-
+ openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} > 0 and ignoring(state) openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= 2
+ for: 2m
+ labels:
+ severity: warning
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "Some {{ $labels.service }} services down"
+ description: >-
+ {{ $value }} '{{ $labels.service }}' service(s) is/are down for 2 minutes
+ NovaOnlyOneServiceUp:
+ if: >-
+ openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 1 and ignoring(state) openstack_nova_services{state=~"down|disabled",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} > 0
+ for: 2m
+ labels:
+ severity: critical
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "Only one {{ $labels.service }} service up"
+ description: >-
+ Only one '{{ $labels.service }}' service is up for 2 minutes
+ NovaAllServicesDown:
+ if: >-
+ openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 0
+ for: 2m
+ labels:
+ severity: down
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "All {{ $labels.service }} services down"
+ description: >-
+ All '{{ $labels.service }}' services are down for 2 minutes
+ NovaSomeComputesDown:
+ if: >-
+ openstack_nova_services{state="down",service=~"nova-compute"} > 0
+ for: 2m
+ labels:
+ severity: warning
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "Some {{ $labels.service }} services down"
+ description: >-
+ {{ $value }} '{{ $labels.service }}' service(s) is/are down for 2 minutes
+ NovaMajorityComputesDown:
+ if: >-
+ openstack_nova_services_percent{state="down",service=~"nova-compute"} > 50
+ for: 2m
+ labels:
+ severity: critical
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "Only one {{ $labels.service }} service up"
+ description: >-
+ Only one '{{ $labels.service }}' service is up for 2 minutes
+ NovaAllComputesDown:
+ if: >-
+ openstack_nova_services{state="up",service=~"nova-compute"} == 0
+ for: 2m
+ labels:
+ severity: down
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "All {{ $labels.service }} services down"
+ description: >-
+ All '{{ $labels.service }}' services are down for 2 minutes
+ NovaTotalFreeVCPUsLow:
+ if: >-
+ (100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "VCPU low limit for new instances"
+ description: >-
+ VPCU low limit for 1 minutes
+ NovaTotalFreeMemoryLow:
+ if: >-
+ (100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 10.0
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Memory low limit for new instances"
+ description: >-
+ Memory low limit for 1 minutes
+ NovaTotalFreeVCPUsShortage:
+ if: >-
+ (100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 2.0
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: "VCPU shortage for new instances"
+ description: >-
+ VPCU shortage for 1 minutes
+ NovaTotalFreeMemoryShortage:
+ if: >-
+ (100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 2.0
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Memory shortage for new instances"
+ description: >-
+ Memory shortage for 1 minutes
+ NovaAggregatesFreeVCPUsLow:
+ if: >-
+ (100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 10.0
+ for: 1m
+ labels:
+ severity: warning
+ aggregate: "{{ $labels.aggregate }}"
+ annotations:
+ summary: "VCPU low limit for new instances on aggregate {{ $labels.aggregate }}"
+ description: >-
+ VPCU low limit for 1 minutes on aggregate {{ $labels.aggregate }}
+ NovaAggregatesFreeMemoryLow:
+ if: >-
+ (100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 10.0
+ for: 1m
+ labels:
+ severity: warning
+ aggregate: "{{ $labels.aggregate }}"
+ annotations:
+ summary: "Memory low limit for new instances on aggregate {{ $labels.aggregate }}"
+ description: >-
+ Memory low limit for 1 minutes on aggregate {{ $labels.aggregate }}
+ NovaAggregatesFreeVCPUsShortage:
+ if: >-
+ (100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 2.0
+ for: 1m
+ labels:
+ severity: critical
+ aggregate: "{{ $labels.aggregate }}"
+ annotations:
+ summary: "VCPU shortage for new instances on aggregate {{ $labels.aggregate }}"
+ description: >-
+ VPCU shortage for 1 minutes on aggregate {{ $labels.aggregate }}
+ NovaAggregatesFreeMemoryShortage:
+ if: >-
+ (100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 2.0
+ for: 1m
+ labels:
+ severity: critical
+ aggregate: "{{ $labels.aggregate }}"
+ annotations:
+ summary: "Memory shortage for new instances on aggregate {{ $labels.aggregate }}"
+ description: >-
+ Memory shortage for 1 minutes on aggregate {{ $labels.aggregate }}
{%- endraw %}
{%- endif %}
NovaErrorLogsTooHigh:
@@ -35,6 +185,19 @@
description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
{%- if is_compute %}
+{%- raw %}
+ NovaLibvirtDown:
+ if: >-
+ max(libvirt_up) by (host) == 0
+ for: 2m
+ labels:
+ severity: down
+ service: "libvirt"
+ annotations:
+ summary: "libvirt check on '{{ $labels.host }}' is down"
+ description: >-
+ libvirt check on '{{ $labels.host }}' is down for 2 minutes
+{%- endraw %}
{%- from "prometheus/map.jinja" import exporters with context %}
{%- if exporters %}
target: