Merge "Add prometheus main dashboard"
diff --git a/heat/map.jinja b/heat/map.jinja
index b17f7d5..2a2d972 100644
--- a/heat/map.jinja
+++ b/heat/map.jinja
@@ -32,5 +32,7 @@
 {% set monitoring = salt['grains.filter_by']({
     'default': {
         'error_log_rate': 0.2,
+        'services_failed_warning_threshold_percent': 0.3,
+        'services_failed_critical_threshold_percent': 0.6,
     },
 }, grain='os_family', merge=salt['pillar.get']('heat:monitoring')) %}
diff --git a/heat/meta/prometheus.yml b/heat/meta/prometheus.yml
index a8dee85..b9500c0 100644
--- a/heat/meta/prometheus.yml
+++ b/heat/meta/prometheus.yml
@@ -16,6 +16,50 @@
         summary: "Endpoint check for '{{ $labels.service }}' is down"
         description: >-
             Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+    HeatAPIServicesInfo:  # per-check rule: no aggregation, so each failing Heat API HTTP check fires on its own
+      if: >-
+          http_response_status{service=~"heat.*-api"} == 0
+      for: 2m  # the check has to stay at 0 for two minutes before the alert fires
+      labels:
+        severity: info
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "HTTP check for '{{ $labels.service }}' down"
+        description: >-
+          The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
+    HeatAPIServicesWarning:  # fires when failing checks / all checks, per service, is >= the warning threshold (a 0-1 fraction)
+      if: >-
+          count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
+      for: 2m  # the ratio has to hold for two minutes before the alert fires
+      labels:
+        severity: warning
+        service: "{{ $labels.service }}"
+      annotations:  # threshold is coerced to float and printed with %g so the percentage shows no float noise (29, not 28.999999999999996)
+        summary: "More than {%- endraw %} {{ '%g'|format(monitoring.services_failed_warning_threshold_percent|float * 100) }}%{%- raw %} of {{ $labels.service }} services are down"
+        description: >-
+            {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{ '%g'|format(monitoring.services_failed_warning_threshold_percent|float * 100) }}%{%- raw %})
+    HeatAPIServicesCritical:  # fires when failing checks / all checks, per service, is >= the critical threshold (a 0-1 fraction)
+      if: >-
+          count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
+      for: 2m  # the ratio has to hold for two minutes before the alert fires
+      labels:
+        severity: critical
+        service: "{{ $labels.service }}"
+      annotations:  # threshold is coerced to float and printed with %g so the percentage shows no float noise (57, not 56.99999999999999)
+        summary: "More than {%- endraw %} {{ '%g'|format(monitoring.services_failed_critical_threshold_percent|float * 100) }}%{%- raw %} of {{ $labels.service }} services are down"
+        description: >-
+            {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{ '%g'|format(monitoring.services_failed_critical_threshold_percent|float * 100) }}%{%- raw %})
+    HeatAPIServicesDown:  # fires when, for a service, the number of failing checks equals the number of checks
+      if: >-
+          count(http_response_status{service=~"heat.*-api"} == 0) by (service) == on (service) count(http_response_status{service=~"heat.*-api"}) by (service)
+      for: 2m  # every check has to stay at 0 for two minutes before the alert fires
+      labels:
+        severity: down
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "All {{ $labels.service }} services are down"
+        description: >-
+          All {{ $labels.service }} services are down for the last 2 minutes
     HeatErrorLogsTooHigh:
 {%- endraw %}
       {%- set log_threshold = monitoring.error_log_rate|float %}