Enable Prometheus support

This change adds alerts on API status and log metrics.

Change-Id: I2fe6ae3683a2e715ea9f5d4d197149284bfb75f5
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 2c762b0..949d7cc 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -15,3 +15,5 @@
enabled: true
telegraf:
enabled: true
+ prometheus:
+ enabled: true
diff --git a/nova/meta/prometheus.yml b/nova/meta/prometheus.yml
new file mode 100644
index 0000000..2c02a06
--- /dev/null
+++ b/nova/meta/prometheus.yml
@@ -0,0 +1,39 @@
+{#- Prometheus alert rules for Nova: emitted only when the controller or compute role is enabled; NovaAPIDown is controller-only. #}
+{%- from "nova/map.jinja" import controller, compute with context %}
+
+{%- set is_controller = controller.get('enabled', False) %}
+{%- set is_compute = compute.get('enabled', False) %}
+
+{%- if is_controller or is_compute %}
+server:
+  alert:
+{%- if is_controller %}
+{% raw %}
+    # Fires when the max openstack_api_check_status per nova.* service stays at 0 for 2 minutes.
+    NovaAPIDown:
+      if: >-
+        max(openstack_api_check_status{service=~"nova.+"}) by (service) == 0
+      for: 2m
+      labels:
+        severity: down
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "Endpoint check for '{{ $labels.service }}' is down"
+        description: >-
+          Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+{%- endraw %}
+{%- endif %}
+    NovaErrorLogsTooHigh:
+      {#- Threshold comes from prometheus_server alert.NovaErrorLogsTooHigh.var.threshold, else 0.2.
+          prometheus_server is not defined in this file (presumably supplied by the rendering context); default({}) tolerates its absence. #}
+      {%- set log_threshold = (prometheus_server | default({})).get('alert', {}).get('NovaErrorLogsTooHigh', {}).get('var', {}).get('threshold', 0.2) %}
+      if: >-
+        sum(rate(log_messages{service="nova",level=~"error|emergency|fatal"}[5m])) without (level) > {{ log_threshold }}
+{%- raw %}
+      labels:
+        severity: warning
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: 'Too many errors in {{ $labels.service }} logs'
+        description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
+{%- endif %}