Merge "Add Barbican integration"
diff --git a/cinder/map.jinja b/cinder/map.jinja
index 134ac55..b8806e6 100644
--- a/cinder/map.jinja
+++ b/cinder/map.jinja
@@ -89,5 +89,7 @@
 {% set monitoring = salt['grains.filter_by']({
     'default': {
         'error_log_rate': 0.2,
+        'services_failed_warning_threshold_percent': 0.3,
+        'services_failed_critical_threshold_percent': 0.6,
     },
 }, grain='os_family', merge=salt['pillar.get']('cinder:monitoring')) %}
diff --git a/cinder/meta/grafana.yml b/cinder/meta/grafana.yml
index 3d30b8c..0d0ecc8 100644
--- a/cinder/meta/grafana.yml
+++ b/cinder/meta/grafana.yml
@@ -69,7 +69,7 @@
                 alias: "Fatal"
                 rawQuery: true
                 query: SELECT count(max) FROM openstack_cinder_http_response_times WHERE environment_label = '$environment' AND http_status = '5xx' AND $timeFilter
-  main:
+  main_influxdb:
     datasource: influxdb
     row:
       ost-control-plane:
@@ -98,3 +98,18 @@
               cluster_status:
                 rawQuery: true
                 query: SELECT last(value) FROM cluster_status WHERE cluster_name = 'cinder-data' AND environment_label = '$environment' AND $timeFilter GROUP BY time($interval) fill(null)
+  main_prometheus:  # control-plane dashboard entry whose datasource is prometheus
+    datasource: prometheus
+    row:
+      ost-control-plane:
+        title: OpenStack Control Plane
+        panel:
+          cinder:
+            title: Cinder
+            links:  # drill-down link to the dashboard named "Cinder"
+            - dashboard: Cinder
+              title: Cinder
+              type: dashboard
+            target:
+              cluster_status:  # averages openstack_api_check_status over services matching "cinder.*"
+                expr: avg(openstack_api_check_status{service=~"cinder.*"})
diff --git a/cinder/meta/prometheus.yml b/cinder/meta/prometheus.yml
index d4d3780..dca35fb 100644
--- a/cinder/meta/prometheus.yml
+++ b/cinder/meta/prometheus.yml
@@ -17,41 +17,52 @@
       annotations:
         summary: "Endpoint check for '{{ $labels.service }}' is down"
         description: >-
-            Endpoint check for '{{ $labels.service }}' is down for 2 minutes
-    CinderAPIServiceDown:
+            Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes
+    CinderAPIServiceInfo:  # info-level alert: cinder-api http_response_status has been 0 for 2 minutes
       if: >-
         http_response_status{service=~"cinder-api"} == 0
       for: 2m
       labels:
-        severity: down
+        severity: info
         service: "{{ $labels.service }}"
       annotations:
         summary: "HTTP check for '{{ $labels.service }}' down"
         description: >-
-            The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes.
-    CinderSomeServicesDown:
+            The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes.
+    CinderServicesInfo:  # info-level alert on openstack_cinder_service == 1; NOTE(review): 1 presumably encodes "down" (see summary) - confirm
       if: >-
-          openstack_cinder_services{state="down",service=~"cinder-volume|cinder-scheduler"} > 0 and ignoring (state) openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} >= 2
+          openstack_cinder_service == 1
+      for: 2m  # the condition has to hold for 2 minutes before the alert fires
+      labels:
+        severity: info
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "'{{ $labels.service }}' is down"
+        description: >-
+            '{{ $labels.service }}' is down on {{ $labels.hostname }} for the last 2 minutes.
+    CinderServicesWarning:  # fires when a service's "down" count is >= the warning fraction of its total count; '%g' renders the percentage without float noise
+      if: >-
+          openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_warning_threshold_percent}} {%- raw %}
       for: 2m
       labels:
         severity: warning
         service: "{{ $labels.service }}"
       annotations:
-        summary: "Some {{ $labels.service }} services are down"
+        summary: "At least {%- endraw %} {{ '%g' | format(monitoring.services_failed_warning_threshold_percent * 100) }}%{%- raw %} of {{ $labels.service }} services are down"
         description: >-
-            {{ $value }} {{ $labels.service }} services are down for 2 minutes
-    CinderOnlyOneServiceUp:
+            {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (at least {%- endraw %} {{ '%g' | format(monitoring.services_failed_warning_threshold_percent * 100) }}%{%- raw %})
+    CinderServicesCritical:  # fires when a service's "down" count is >= the critical fraction of its total count; '%g' renders the percentage without float noise
       if: >-
-          openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} == 1 and ignoring (state) openstack_cinder_services{state="down",service=~"cinder-volume|cinder-scheduler"} > 0
+          openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * {%- endraw %} {{monitoring.services_failed_critical_threshold_percent}} {%- raw %}
       for: 2m
       labels:
         severity: critical
         service: "{{ $labels.service }}"
       annotations:
-        summary: "Only one {{ $labels.service }} service is up"
+        summary: "At least {%- endraw %} {{ '%g' | format(monitoring.services_failed_critical_threshold_percent * 100) }}%{%- raw %} of {{ $labels.service }} services are down"
         description: >-
-            Only one {{ $labels.service }} service is up for 2 minutes
-    CinderAllServicesDown:
+            {{ $value }} {{ $labels.service }} services are down for the last 2 minutes (at least {%- endraw %} {{ '%g' | format(monitoring.services_failed_critical_threshold_percent * 100) }}%{%- raw %})
+    CinderServicesDown:
       if: >-
         openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} == 0
       for: 2m
@@ -61,7 +72,7 @@
       annotations:
         summary: "All {{ $labels.service }} services are down"
         description: >-
-            All {{ $labels.service }} services are down for 2 minutes
+            All {{ $labels.service }} services are down for the last 2 minutes
 {%- endraw %}
 {%- endif %}
     CinderErrorLogsTooHigh: