Enable Prometheus support
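This change adds Prometheus alerts for Heat API endpoint status and for the rate of error-level log messages, and enables Prometheus in the service support metadata.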
Change-Id: I443c128ead4d33c4e2887034bdebd09dcd8837b4
diff --git a/heat/meta/prometheus.yml b/heat/meta/prometheus.yml
new file mode 100644
index 0000000..dc3d01c
--- /dev/null
+++ b/heat/meta/prometheus.yml
@@ -0,0 +1,37 @@
+{#- Prometheus alert rules for Heat; rendered only when the heat server pillar is defined and enabled. #}
+{%- if pillar.heat.server is defined %}
+
+{%- from "heat/map.jinja" import server with context %}
+{%- if server.get('enabled', False) %}
+{% raw %}
+server:
+  alert:
+    # Raised once the highest API check status of a service has been 0 for 2 minutes.
+    # The matcher "heat.+" needs at least one character after "heat".
+    HeatAPIDown:
+      if: >-
+        max(openstack_api_check_status{service=~"heat.+"}) by (service) == 0
+      for: 2m
+      labels:
+        severity: down
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "Endpoint check for '{{ $labels.service }}' is down"
+        description: >-
+          Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+    # Raised when the 5-minute rate of error, emergency or fatal log messages exceeds the threshold.
+    HeatErrorLogsTooHigh:
+{%- endraw %}
+      {#- The threshold can be overridden under alert.HeatErrorLogsTooHigh.var.threshold of prometheus_server. NOTE(review): prometheus_server is not imported here and is presumably supplied by the including template; the default filter keeps the 0.2 fallback when it is absent. #}
+      {%- set log_threshold = (prometheus_server | default({})).get('alert', {}).get('HeatErrorLogsTooHigh', {}).get('var', {}).get('threshold', 0.2 ) %}
+      if: >-
+        sum(rate(log_messages{service="heat",level=~"error|emergency|fatal"}[5m])) without (level) > {{ log_threshold }}
+{%- raw %}
+      labels:
+        severity: warning
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: 'Too many errors in {{ $labels.service }} logs'
+        description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
+{%- endif %}
+{%- endif %}
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 27889ff..e39cbbd 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -15,3 +15,5 @@
enabled: true
telegraf:
enabled: true
+ prometheus:
+ enabled: true