OpenStack API check improvements
- show the public endpoint on the dashboard;
- since all endpoints in the Service Catalog are behind haproxy, we should
treat an outage of any of them as a critical issue;
- switch from the "name" label to the "service_name" label;
- replace component alerts with the generic OpenstackServiceEndpointDown alert.
Change-Id: Id15c297fd08ea09cfa4eadf08e40c3972bdcfb48
Related-PROD: PROD-35549
diff --git a/keystone/files/grafana_dashboards/keystone_prometheus.json b/keystone/files/grafana_dashboards/keystone_prometheus.json
index c2d08d6..4c378d5 100755
--- a/keystone/files/grafana_dashboards/keystone_prometheus.json
+++ b/keystone/files/grafana_dashboards/keystone_prometheus.json
@@ -71,16 +71,15 @@
"tableColumn": "",
"targets": [
{
- "expr": "min(openstack_api_check_status{service=~\"keystone.*public.*\"})",
+ "expr": "min(openstack_api_check_status{interface=~\"public|\",service_name=~\"keystone.*\"})",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{ service }}",
"refId": "A",
"step": 60
}
],
"thresholds": "0.5,1.5",
- "title": "API Availability",
+ "title": "Public VIP API Availability",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
diff --git a/keystone/files/grafana_dashboards/keystone_prometheus_fluentd.json b/keystone/files/grafana_dashboards/keystone_prometheus_fluentd.json
index ed64a67..64d9505 100755
--- a/keystone/files/grafana_dashboards/keystone_prometheus_fluentd.json
+++ b/keystone/files/grafana_dashboards/keystone_prometheus_fluentd.json
@@ -97,14 +97,14 @@
"tableColumn": "",
"targets": [
{
- "expr": "min(openstack_api_check_status{name=~\"keystone.*public.*\"})",
+ "expr": "min(openstack_api_check_status{interface=~\"public|\",service_name=~\"keystone.*\"})",
"format": "time_series",
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": "0.5,0.5",
- "title": "VIP API availability",
+ "title": "Public VIP API Availability",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
diff --git a/keystone/meta/grafana.yml b/keystone/meta/grafana.yml
index 33e9ad8..0ace907 100644
--- a/keystone/meta/grafana.yml
+++ b/keystone/meta/grafana.yml
@@ -44,7 +44,7 @@
type: dashboard
target:
cluster_status:
- expr: avg(openstack_api_check_status{service=~"keystone.*public.*"})
+ expr: min(openstack_api_check_status{interface=~"public|",service_name=~"keystone.*"})
service_level:
datasource: influxdb
row:
diff --git a/keystone/meta/prometheus.yml b/keystone/meta/prometheus.yml
index c2fb6c8..97ff533 100644
--- a/keystone/meta/prometheus.yml
+++ b/keystone/meta/prometheus.yml
@@ -1,6 +1,6 @@
-{%- if pillar.keystone.server is defined and pillar.keystone.server.get('enabled') %}
{%- from "keystone/map.jinja" import monitoring with context %}
server:
+{%- if pillar.keystone.server is defined and pillar.keystone.server.get('enabled') %}
target:
dns:
enabled: true
@@ -15,19 +15,23 @@
- 'tasks.monitoring_remote_agent'
type: A
port: '{{ monitoring.output_openstack_api_port }}'
-{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
-{% raw %}
+{%- endif %}
alert:
- KeystoneApiOutage:
+{%- raw %}
+ OpenstackServiceEndpointDown:
if: >-
- openstack_api_check_status{name=~"keystone.*"} == 0
+ openstack_api_check_status == 0
labels:
severity: critical
- service: keystone
+ service: openstack
annotations:
- summary: "Keystone API outage"
+ summary: "OpenStack API endpoint down"
description: >-
- Keystone API is not accessible for the Keystone endpoint in the OpenStack service catalog.
+ {{ $labels.service_name }} {{ $labels.interface }} API endpoint from the OpenStack Service Catalog didn't pass the HTTP-probe check.
+{%- endraw %}
+{%- if pillar.keystone.server is defined and pillar.keystone.server.get('enabled') %}
+{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
+{%- raw %}
KeystoneApiEndpointDown:
if: >-
http_response_status{name=~"keystone.*"} == 0
@@ -90,12 +94,12 @@
{%- endif %}
{%- set range_duration = monitoring.api_monitoring_duration %}
recording:
- name:openstack_api_check_status:
+ service_name:openstack_api_check_status:
query: >-
- avg(openstack_api_check_status) by (name)
- name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil:
+ avg(openstack_api_check_status) by (service_name)
+ service_name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil:
query: >-
- ceil(avg_over_time(name:openstack_api_check_status[{{ range_duration }}])) and name:openstack_api_check_status and name:openstack_api_check_status offset {{ range_duration }}
- name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil:avg{{ range_duration }}:floor:
+ ceil(avg_over_time(service_name:openstack_api_check_status[{{ range_duration }}])) and service_name:openstack_api_check_status and service_name:openstack_api_check_status offset {{ range_duration }}
+ service_name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil:avg{{ range_duration }}:floor:
query: >-
- floor(avg_over_time(name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil[{{ range_duration }}]))
+ floor(avg_over_time(service_name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil[{{ range_duration }}]))