Merge "Add prometheus main dashboard"

commit: d57cbf9ff3cdaf54eccff063fb51555d511e544d [log] [tgz]
author: Martin Polreich <mpolreich@mirantis.com> Wed Oct 18 11:21:48 2017 +0000
committer: Gerrit Code Review <gerrit2@7cd0c2eb159e> Wed Oct 18 11:21:48 2017 +0000
tree: 751fc2a1bd8b95b8da2dae7afc9685925f073e37
parent: f93021379ff160032ded7927fbe087aceeb6f41d [diff]
parent: 4afa4a7762ae63c9e36f83e537531cc5e4d4fa4b [diff]
diff --git a/heat/map.jinja b/heat/map.jinja
index b17f7d5..2a2d972 100644
--- a/heat/map.jinja
+++ b/heat/map.jinja

@@ -32,5 +32,7 @@
 {% set monitoring = salt['grains.filter_by']({
     'default': {
         'error_log_rate': 0.2,
+        'services_failed_warning_threshold_percent': 0.3,
+        'services_failed_critical_threshold_percent': 0.6,
     },
 }, grain='os_family', merge=salt['pillar.get']('heat:monitoring')) %}

diff --git a/heat/meta/prometheus.yml b/heat/meta/prometheus.yml
index a8dee85..b9500c0 100644
--- a/heat/meta/prometheus.yml
+++ b/heat/meta/prometheus.yml

@@ -16,6 +16,50 @@
         summary: "Endpoint check for '{{ $labels.service }}' is down"
         description: >-
             Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+    HeatAPIServicesInfo:  # info-level rule: an individual heat API HTTP check reports status 0
+      if: >-
+        http_response_status{service=~"heat.*-api"} == 0
+      for: 2m  # condition has to hold for 2 minutes, matching the wording of the description
+      labels:
+        severity: info
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "HTTP check for '{{ $labels.service }}' down"
+        description: >-
+            The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
+    HeatAPIServicesWarning:  # per service: count of checks with status 0 >= total checks * warning threshold (a 0..1 fraction)
+      if: >-
+        count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
+      for: 2m
+      labels:
+        severity: warning
+        service: "{{ $labels.service }}"
+      annotations:  # percentage goes through '%g' so that 0.3 * 100 renders as 30 instead of 30.000000000000004
+        summary: "More than {%- endraw %} {{ '%g'|format(monitoring.services_failed_warning_threshold_percent|float * 100) }}%{%- raw %} of {{ $labels.service }} services are down"
+        description: >-
+            {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{ '%g'|format(monitoring.services_failed_warning_threshold_percent|float * 100) }}%{%- raw %})
+    HeatAPIServicesCritical:  # per service: count of checks with status 0 >= total checks * critical threshold (a 0..1 fraction)
+      if: >-
+        count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
+      for: 2m
+      labels:
+        severity: critical
+        service: "{{ $labels.service }}"
+      annotations:  # percentage goes through '%g' so that float artefacts (60.0, 55.00000000000001) never reach the alert text
+        summary: "More than {%- endraw %} {{ '%g'|format(monitoring.services_failed_critical_threshold_percent|float * 100) }}%{%- raw %} of {{ $labels.service }} services are down"
+        description: >-
+            {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than {%- endraw %} {{ '%g'|format(monitoring.services_failed_critical_threshold_percent|float * 100) }}%{%- raw %})
+    HeatAPIServicesDown:  # per service: the number of checks with status 0 equals the total number of checks
+      if: >-
+        count(http_response_status{service=~"heat.*-api"} == 0) by (service) == on (service) count(http_response_status{service=~"heat.*-api"}) by (service)
+      for: 2m  # condition has to hold for 2 minutes, matching the wording of the description
+      labels:
+        severity: down
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "All {{ $labels.service }} services are down"
+        description: >-
+            All {{ $labels.service }} services are down for the last 2 minutes
     HeatErrorLogsTooHigh:
 {%- endraw %}
       {%- set log_threshold = monitoring.error_log_rate|float %}
commit	d57cbf9ff3cdaf54eccff063fb51555d511e544d	[log] [tgz]
author	Martin Polreich <mpolreich@mirantis.com>	Wed Oct 18 11:21:48 2017 +0000
committer	Gerrit Code Review <gerrit2@7cd0c2eb159e>	Wed Oct 18 11:21:48 2017 +0000
tree	751fc2a1bd8b95b8da2dae7afc9685925f073e37
parent	f93021379ff160032ded7927fbe087aceeb6f41d [diff]
parent	4afa4a7762ae63c9e36f83e537531cc5e4d4fa4b [diff]