Merge "Add prometheus main dashboard"
diff --git a/heat/map.jinja b/heat/map.jinja
index b17f7d5..2a2d972 100644
--- a/heat/map.jinja
+++ b/heat/map.jinja
@@ -32,5 +32,7 @@
{% set monitoring = salt['grains.filter_by']({
'default': {
'error_log_rate': 0.2,
+ 'services_failed_warning_threshold_percent': 0.3,
+ 'services_failed_critical_threshold_percent': 0.6,
},
}, grain='os_family', merge=salt['pillar.get']('heat:monitoring')) %}
diff --git a/heat/meta/prometheus.yml b/heat/meta/prometheus.yml
index a8dee85..b9500c0 100644
--- a/heat/meta/prometheus.yml
+++ b/heat/meta/prometheus.yml
@@ -16,6 +16,50 @@
summary: "Endpoint check for '{{ $labels.service }}' is down"
description: >-
Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+ HeatAPIServicesInfo:  # Per-series rule: matches each heat "*-api" HTTP check whose http_response_status equals 0.
+ if: >-
+ http_response_status{service=~"heat.*-api"} == 0
+ for: 2m  # The expression has to stay true for 2 minutes before the alert fires.
+ labels:
+ severity: info  # Single failing check; the aggregate HeatAPIServices* rules use warning/critical/down.
+ service: "{{ $labels.service }}"  # Copies the matched series' service label onto the alert.
+ annotations:
+ summary: "HTTP check for '{{ $labels.service }}' down"
+ description: >-
+ The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
+ HeatAPIServicesWarning:  # Per service label: count of heat API checks at status 0 vs. total checks times the warning threshold (>=, so equality also fires).
+ if: >-
+ count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent|float}} {%- raw %}
+ for: 2m  # The expression has to stay true for 2 minutes before the alert fires.
+ labels:
+ severity: warning
+ service: "{{ $labels.service }}"  # Copies the aggregated service label onto the alert.
+ annotations:  # The threshold is a fraction (default 0.3); it is cast with |float and shown below as a percentage rounded to one decimal.
+ summary: "More than {%- endraw %} {{(monitoring.services_failed_warning_threshold_percent|float * 100)|round(1)}}%{%- raw %} of {{ $labels.service }} services are down"
+ description: >-
+ {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{(monitoring.services_failed_warning_threshold_percent|float * 100)|round(1)}}%{%- raw %})
+ HeatAPIServicesCritical:  # Per service label: count of heat API checks at status 0 vs. total checks times the critical threshold (>=, so equality also fires).
+ if: >-
+ count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent|float}} {%- raw %}
+ for: 2m  # The expression has to stay true for 2 minutes before the alert fires.
+ labels:
+ severity: critical
+ service: "{{ $labels.service }}"  # Copies the aggregated service label onto the alert.
+ annotations:  # The threshold is a fraction (default 0.6); it is cast with |float and shown below as a percentage rounded to one decimal.
+ summary: "More than {%- endraw %} {{(monitoring.services_failed_critical_threshold_percent|float * 100)|round(1)}}%{%- raw %} of {{ $labels.service }} services are down"
+ description: >-
+ {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{(monitoring.services_failed_critical_threshold_percent|float * 100)|round(1)}}%{%- raw %})
+ HeatAPIServicesDown:  # Per service label: the number of heat API checks at status 0 equals the total number of checks.
+ if: >-
+ count(http_response_status{service=~"heat.*-api"} == 0) by (service) == on (service) count(http_response_status{service=~"heat.*-api"}) by (service)
+ for: 2m  # The expression has to stay true for 2 minutes before the alert fires.
+ labels:
+ severity: down  # Distinct from the warning/critical severities used by the threshold-based rules.
+ service: "{{ $labels.service }}"  # Copies the aggregated service label onto the alert.
+ annotations:
+ summary: "All {{ $labels.service }} services are down"
+ description: >-
+ All {{ $labels.service }} services are down for the last 2 minutes
HeatErrorLogsTooHigh:
{%- endraw %}
{%- set log_threshold = monitoring.error_log_rate|float %}