Align OpenStack API downtime calculation
1. Separate the OpenStack API check from the other OpenStack metrics
2. Add recording rules to count API downtime lasting 5 consecutive minutes
Related-Prod: PROD-31017
Change-Id: Id21a3d8e367afbc24e3a71f788fa33d60bdb39d4
diff --git a/keystone/map.jinja b/keystone/map.jinja
index cbf1f03..8c18968 100644
--- a/keystone/map.jinja
+++ b/keystone/map.jinja
@@ -112,5 +112,9 @@
'telegraf_openstack_input_plugin_interval': '3m',
'telegraf_openstack_output_plugin_expiration_interval': '6m',
'output_openstack_port': 9127,
+ 'telegraf_openstack_api_input_plugin_interval': '15s',
+ 'telegraf_openstack_api_expiration_interval': '30s',
+ 'output_openstack_api_port': 9128,
+ 'api_monitoring_duration': '5m',
},
}, grain='os_family', merge=salt['pillar.get']('keystone:monitoring')) %}
diff --git a/keystone/meta/prometheus.yml b/keystone/meta/prometheus.yml
index 7378f76..c2fb6c8 100644
--- a/keystone/meta/prometheus.yml
+++ b/keystone/meta/prometheus.yml
@@ -10,6 +10,11 @@
- 'tasks.monitoring_remote_agent'
type: A
port: '{{ monitoring.output_openstack_port }}'
+ - name: 'remote_agent_openstack_api'  # separate scrape target for the OpenStack API-check metrics
+ domain:
+ - 'tasks.monitoring_remote_agent'
+ type: A
+ port: '{{ monitoring.output_openstack_api_port }}'  # same port variable the prometheus_client_openstack_api output in keystone/meta/telegraf.yml binds to
{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
{% raw %}
alert:
@@ -83,3 +88,14 @@
summary: "High response time of Keystone API"
description: "The Keystone API response time for GET and POST requests on the {{ $labels.host }} node is higher than {% endraw %}{{response_time_threshold}}s for 2 minutes."
{%- endif %}
+{%- set range_duration = monitoring.api_monitoring_duration %}
+ recording:  # recording rules; every window below uses range_duration (monitoring.api_monitoring_duration)
+ name:openstack_api_check_status:  # average of openstack_api_check_status grouped by the "name" label
+ query: >-
+ avg(openstack_api_check_status) by (name)
+ name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil:  # ceil of the per-name average over range_duration, kept only where the series has a sample now and range_duration ago
+ query: >-
+ ceil(avg_over_time(name:openstack_api_check_status[{{ range_duration }}])) and name:openstack_api_check_status and name:openstack_api_check_status offset {{ range_duration }}
+ name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil:avg{{ range_duration }}:floor:  # floor of the range_duration average of the rule above; NOTE(review): presumably the downtime indicator - confirm
+ query: >-
+ floor(avg_over_time(name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil[{{ range_duration }}]))
diff --git a/keystone/meta/telegraf.yml b/keystone/meta/telegraf.yml
index 2c25702..19882c7 100644
--- a/keystone/meta/telegraf.yml
+++ b/keystone/meta/telegraf.yml
@@ -1,17 +1,27 @@
{%- from "keystone/map.jinja" import monitoring, server with context %}
{%- if server is defined and server.get('enabled', False) %}
-{%- set openstack_input_interval = monitoring.telegraf_openstack_input_plugin_interval %}
-{%- set openstack_output_expiration_interval = monitoring.telegraf_openstack_output_plugin_expiration_interval %}
-
remote_agent:
input:
openstack:
- interval: "{{ openstack_input_interval }}"
+ template: telegraf/files/input/openstack.conf  # same template file the openstack_api input uses
+ interval: "{{ monitoring.telegraf_openstack_input_plugin_interval }}"  # read directly from the monitoring map (map.jinja default: 3m)
project: "{{ server.admin_tenant }}"
tenant: "{{ server.admin_tenant }}"
region: "{{ server.region }}"
username: "{{ server.admin_name }}"
password: "{{ server.admin_password }}"
+ services: ["cinder", "glance", "keystone", "keystonev3", "neutron", "nova", "nova_instances"]  # "checks" is not listed here; the openstack_api input collects it
+ identity_endpoint: "{{ server.bind.private_protocol }}://{{ server.bind.private_address|replace('0.0.0.0', '127.0.0.1') }}:{{ server.bind.private_port }}/"  # built from server.bind; a 0.0.0.0 address is replaced with 127.0.0.1
+ monitor_agents: "true"
+ openstack_api:  # second input built from the same template, limited to the "checks" service and run on its own interval
+ template: telegraf/files/input/openstack.conf
+ interval: "{{ monitoring.telegraf_openstack_api_input_plugin_interval }}"  # map.jinja default: 15s
+ project: "{{ server.admin_tenant }}"
+ tenant: "{{ server.admin_tenant }}"
+ region: "{{ server.region }}"
+ username: "{{ server.admin_name }}"
+ password: "{{ server.admin_password }}"
+ services: ["checks"]  # the only service this input queries
identity_endpoint: "{{ server.bind.private_protocol }}://{{ server.bind.private_address|replace('0.0.0.0', '127.0.0.1') }}:{{ server.bind.private_port }}/"
monitor_agents: "true"
output:
@@ -23,9 +33,20 @@
bind:
address: 0.0.0.0
port: "{{ monitoring.output_openstack_port }}"
- expiration_interval: "{{ openstack_output_expiration_interval }}"
+ expiration_interval: "{{ monitoring.telegraf_openstack_output_plugin_expiration_interval }}"  # read directly from the monitoring map (map.jinja default: 6m)
# Measurement filtering
namepass: ["openstack*"]
+ namedrop: ["openstack_api*"]  # openstack_api* measurements are exported by prometheus_client_openstack_api instead
+ prometheus_client_openstack_api:  # separate Prometheus output that exposes only the openstack_api* measurements
+ template: telegraf/files/output/prometheus_client.conf
+ engine: prometheus
+ # Output plugin configuration
+ bind:
+ address: 0.0.0.0
+ port: "{{ monitoring.output_openstack_api_port }}"  # map.jinja default: 9128
+ expiration_interval: "{{ monitoring.telegraf_openstack_api_expiration_interval }}"  # map.jinja default: 30s
+ # Measurement filtering
+ namepass: ["openstack_api*"]  # pass only measurements whose name matches openstack_api*
prometheus_client:
namedrop: ["openstack*"]
agent: