Align OpenStack API downtime calculation

1. Separate the OpenStack API check from the other OpenStack metrics
2. Add recording rules that count API downtime of 5 consecutive minutes

Related-Prod: PROD-31017

Change-Id: Id21a3d8e367afbc24e3a71f788fa33d60bdb39d4
diff --git a/keystone/map.jinja b/keystone/map.jinja
index cbf1f03..8c18968 100644
--- a/keystone/map.jinja
+++ b/keystone/map.jinja
@@ -112,5 +112,9 @@
         'telegraf_openstack_input_plugin_interval': '3m',
         'telegraf_openstack_output_plugin_expiration_interval': '6m',
         'output_openstack_port': 9127,
+        'telegraf_openstack_api_input_plugin_interval': '15s',
+        'telegraf_openstack_api_expiration_interval': '30s',
+        'output_openstack_api_port': 9128,
+        'api_monitoring_duration': '5m',
     },
 }, grain='os_family', merge=salt['pillar.get']('keystone:monitoring')) %}
diff --git a/keystone/meta/prometheus.yml b/keystone/meta/prometheus.yml
index 7378f76..c2fb6c8 100644
--- a/keystone/meta/prometheus.yml
+++ b/keystone/meta/prometheus.yml
@@ -10,6 +10,11 @@
           - 'tasks.monitoring_remote_agent'
           type: A
           port: '{{ monitoring.output_openstack_port }}'
+        - name: 'remote_agent_openstack_api'  # additional entry for the port exposing the OpenStack API check output
+          domain:
+          - 'tasks.monitoring_remote_agent'  # same DNS name as the entry above
+          type: A
+          port: '{{ monitoring.output_openstack_api_port }}'  # differs from the entry above only by name and port
 {%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
 {% raw %}
   alert:
@@ -83,3 +88,14 @@
         summary: "High response time of Keystone API"
         description: "The Keystone API response time for GET and POST requests on the {{ $labels.host }} node is higher than {% endraw %}{{response_time_threshold}}s for 2 minutes."
 {%- endif %}
+{%- set range_duration = monitoring.api_monitoring_duration %}
+  recording:  # rules derived from openstack_api_check_status; the window is monitoring.api_monitoring_duration
+    name:openstack_api_check_status:  # average of the check status over all series sharing a `name` label
+      query: >-
+        avg(openstack_api_check_status) by (name)
+    name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil:  # ceil of the windowed average, kept only where the base rule has a sample both now and one window ago
+      query: >-
+        ceil(avg_over_time(name:openstack_api_check_status[{{ range_duration }}])) and name:openstack_api_check_status and name:openstack_api_check_status offset {{ range_duration }}
+    name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil:avg{{ range_duration }}:floor:  # floor of the windowed average of the previous rule
+      query: >-
+        floor(avg_over_time(name:openstack_api_check_status:avg{{ range_duration }}:for{{ range_duration }}:ceil[{{ range_duration }}]))
diff --git a/keystone/meta/telegraf.yml b/keystone/meta/telegraf.yml
index 2c25702..19882c7 100644
--- a/keystone/meta/telegraf.yml
+++ b/keystone/meta/telegraf.yml
@@ -1,17 +1,27 @@
 {%- from "keystone/map.jinja" import monitoring, server with context %}
 {%- if server is defined and server.get('enabled', False) %}
-{%- set openstack_input_interval = monitoring.telegraf_openstack_input_plugin_interval %}
-{%- set openstack_output_expiration_interval = monitoring.telegraf_openstack_output_plugin_expiration_interval %}
-
 remote_agent:
   input:
     openstack:
-      interval: "{{ openstack_input_interval }}"
+      template: telegraf/files/input/openstack.conf  # template is now set explicitly; the openstack_api input below uses the same file
+      interval: "{{ monitoring.telegraf_openstack_input_plugin_interval }}"  # read straight from the monitoring map; the intermediate set variable is dropped
       project: "{{ server.admin_tenant }}"
       tenant: "{{ server.admin_tenant }}"
       region: "{{ server.region }}"
       username: "{{ server.admin_name }}"
       password: "{{ server.admin_password }}"
+      services: ["cinder", "glance", "keystone", "keystonev3", "neutron", "nova", "nova_instances"]  # "checks" is listed only under the openstack_api input below
+      identity_endpoint: "{{ server.bind.private_protocol }}://{{ server.bind.private_address|replace('0.0.0.0', '127.0.0.1') }}:{{ server.bind.private_port }}/"
+      monitor_agents: "true"
+    openstack_api:  # second input: same template and credentials as openstack, with its own interval and service list
+      template: telegraf/files/input/openstack.conf
+      interval: "{{ monitoring.telegraf_openstack_api_input_plugin_interval }}"  # defaults to '15s' in keystone/map.jinja
+      project: "{{ server.admin_tenant }}"
+      tenant: "{{ server.admin_tenant }}"
+      region: "{{ server.region }}"
+      username: "{{ server.admin_name }}"
+      password: "{{ server.admin_password }}"
+      services: ["checks"]  # restricted to the "checks" service
       identity_endpoint: "{{ server.bind.private_protocol }}://{{ server.bind.private_address|replace('0.0.0.0', '127.0.0.1') }}:{{ server.bind.private_port }}/"
       monitor_agents: "true"
   output:
@@ -23,9 +33,20 @@
       bind:
         address: 0.0.0.0
         port: "{{ monitoring.output_openstack_port }}"
-      expiration_interval: "{{ openstack_output_expiration_interval }}"
+      expiration_interval: "{{ monitoring.telegraf_openstack_output_plugin_expiration_interval }}"
       # Measurement filtering
       namepass: ["openstack*"]
+      namedrop: ["openstack_api*"]  # openstack_api* measurements go to prometheus_client_openstack_api below
+    prometheus_client_openstack_api:  # separate output for openstack_api* measurements, with its own port and expiration interval
+      template: telegraf/files/output/prometheus_client.conf
+      engine: prometheus
+      # Output plugin configuration
+      bind:
+        address: 0.0.0.0
+        port: "{{ monitoring.output_openstack_api_port }}"
+      expiration_interval: "{{ monitoring.telegraf_openstack_api_expiration_interval }}"
+      # Measurement filtering
+      namepass: ["openstack_api*"]
     prometheus_client:
       namedrop: ["openstack*"]
 agent: