Rework Neutron alerts
Change-Id: I835212a8bcf6c9b62ad9ffd0b6df93fea9c62ab5
Related-PROD: PROD-20029
diff --git a/neutron/map.jinja b/neutron/map.jinja
index c509b31..e474ef0 100644
--- a/neutron/map.jinja
+++ b/neutron/map.jinja
@@ -169,5 +169,8 @@
{% set monitoring = salt['grains.filter_by']({
'default': {
'error_log_rate': 0.2,
+ 'endpoint_failed_major_threshold': 0.5,
+ 'agents_failed_minor_threshold': 0.3,
+ 'agents_failed_major_threshold': 0.6,
},
}, grain='os_family', merge=salt['pillar.get']('neutron:monitoring')) %}
diff --git a/neutron/meta/prometheus.yml b/neutron/meta/prometheus.yml
index 4f8471c..35b8449 100644
--- a/neutron/meta/prometheus.yml
+++ b/neutron/meta/prometheus.yml
@@ -1,78 +1,113 @@
{%- from "neutron/map.jinja" import server, monitoring with context %}
{%- if server.get('enabled', False) %}
+{%- set major_threshold = monitoring.endpoint_failed_major_threshold|float %}
{%- raw %}
server:
alert:
- NeutronAPIDown:
+ NeutronAPIOutage:  # matches when the neutron API check reports 0; no "for" delay is set on this alert
if: >-
- openstack_api_check_status{service=~"neutron.*"} == 0
- for: 2m
+ openstack_api_check_status{name="neutron"} == 0
labels:
- severity: down
- service: "{{ $labels.service }}"
+ severity: critical
+ service: neutron
annotations:
- summary: "Endpoint check for '{{ $labels.service }}' is down"
+ summary: "Neutron API outage"
description: >-
- Endpoint check for '{{ $labels.service }}' is down for 2 minutes
+ Neutron API is not accessible for the Neutron endpoint in the OpenStack service catalog.
NeutronAPIServiceDown:
if: >-
- http_response_status{service=~"neutron-api"} == 0
+ http_response_status{name="neutron-api"} == 0
for: 2m
labels:
- severity: down
- service: "{{ $labels.service }}"
+ severity: minor  # per-series alert: the expression matches each neutron-api check that reports 0
+ service: neutron
annotations:
- summary: "HTTP check for '{{ $labels.service }}' down"
+ summary: "Host neutron-api endpoint is not accessible"
description: >-
- The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes.
+ The host neutron-api endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes.
{%- endraw %}
-{%- if server.get('backend', {}).engine is defined and server.backend.engine == "ml2" %}
-{%- raw %}
- NeutronSomeAgentsDown:
+ NeutronAPIServiceDownMajor:  # failing-check count compared against total * major_threshold; percentage text is %g-formatted so float noise never reaches the alert
if: >-
- openstack_neutron_agents{state="down"} > 0 and ignoring(state) openstack_neutron_agents{state="up"} >= 2
+ count(http_response_status{name="neutron-api"} == 0) >= count(http_response_status{name="neutron-api"}) * {{ major_threshold }}
for: 2m
labels:
- severity: warning
- service: "{{ $labels.service }}"
+ severity: major
+ service: neutron
annotations:
- summary: "Some {{ $labels.service }} agents down"
+ summary: "{{ '%g'|format(major_threshold * 100) }}% of host neutron-api endpoints are not accessible"
description: >-
- {{ $value }} '{{ $labels.service }}' agent(s) is/are down for 2 minutes
- NeutronOnlyOneAgentUp:
+ {% raw %}{{ $value }} host neutron-api endpoints are not accessible for at least 2 minutes (at least {% endraw %}{{ '%g'|format(major_threshold * 100) }}{% raw %}%).
+ NeutronAPIServiceOutage:  # matches when the count of neutron-api checks reporting 0 equals the count of all neutron-api checks
if: >-
- openstack_neutron_agents{state="up"} == 1 and ignoring(state) openstack_neutron_agents{state=~"down|disabled"} > 0
+ count(http_response_status{name="neutron-api"} == 0) == count(http_response_status{name="neutron-api"})
for: 2m
labels:
severity: critical
- service: "{{ $labels.service }}"
+ service: neutron
annotations:
- summary: "Only one {{ $labels.service }} agent up"
+ summary: "Host neutron-api outage"
description: >-
- Only one '{{ $labels.service }}' agent is up for 2 minutes
- NeutronAllAgentsDown:
- if: >-
- openstack_neutron_agents{state="up"} == 0
- for: 2m
- labels:
- severity: down
- service: "{{ $labels.service }}"
- annotations:
- summary: "All {{ $labels.service }} agents down"
- description: >-
- All '{{ $labels.service }}' agents are down for 2 minutes
- NeutronErrorLogsTooHigh:
+ All available host neutron-api endpoints are not accessible for at least 2 minutes.
{%- endraw %}
+{%- if server.get('backend', {}).engine is defined and server.backend.engine == "ml2" %}
+{%- set minor_threshold = monitoring.agents_failed_minor_threshold|float %}
+{%- set major_threshold = monitoring.agents_failed_major_threshold|float %}
+{%- raw %}
+ NeutronAgentDown:  # one alert per openstack_neutron_agent_state series whose value is 0; no "for" delay is set
+ if: >-
+ openstack_neutron_agent_state == 0
+ labels:
+ severity: minor
+ service: neutron
+ annotations:
+ summary: "{{ $labels.binary }} agent is down"
+ description: >-
+ The {{ $labels.binary }} agent on the {{ $labels.hostname }} node is down.
+{%- endraw %}
+ NeutronAgentsDownMinor:  # per binary: down-agent count vs. total * minor_threshold; percentage text is %g-formatted because 0.3 * 100 is 30.000000000000004 as a raw float
+ if: >-
+ count(openstack_neutron_agent_state == 0) by (binary) >= on (binary) count(openstack_neutron_agent_state) by (binary) * {{minor_threshold}}
+ labels:
+ severity: minor
+ service: neutron
+ annotations:
+ summary: "{{ '%g'|format(minor_threshold * 100) }}%{% raw %} of {{ $labels.binary }} agents are down"
+ description: >-
+ {{ $value }} {{ $labels.binary }} agents are down {% endraw %}(at least {{ '%g'|format(minor_threshold * 100) }}%).
+ NeutronAgentsDownMajor:  # per binary: down-agent count vs. total * major_threshold; percentage text is %g-formatted so it prints 60 rather than 60.0
+ if: >-
+ count(openstack_neutron_agent_state == 0) by (binary) >= on (binary) count(openstack_neutron_agent_state) by (binary) * {{major_threshold}}
+ labels:
+ severity: major
+ service: neutron
+ annotations:
+ summary: "{{ '%g'|format(major_threshold * 100) }}%{% raw %} of {{ $labels.binary }} agents are down"
+ description: >-
+ {{ $value }} {{ $labels.binary }} agents are down {% endraw %}(at least {{ '%g'|format(major_threshold * 100) }}%).
+{%- raw %}
+ NeutronAgentOutage:  # per binary: the count of agents with state 0 equals the count of all agents of that binary
+ if: >-
+ count(openstack_neutron_agent_state == 0) by (binary) == on (binary) count(openstack_neutron_agent_state) by (binary)
+ labels:
+ severity: critical
+ service: neutron
+ annotations:
+ summary: "{{ $labels.binary }} agent outage"
+ description: >-
+ All {{ $labels.binary }} agents are down.
+{%- endraw %}
+ NeutronErrorLogsTooHigh:  # threshold is monitoring.error_log_rate cast to float (log_threshold), compared with the 5m error-log rate
{%- set log_threshold = monitoring.error_log_rate|float %}
if: >-
sum(rate(log_messages{service="neutron",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > {{ log_threshold }}
{%- raw %}
labels:
severity: warning
- service: "{{ $labels.service }}"
+ service: neutron
annotations:
- summary: 'Too many errors in {{ $labels.service }} logs'
- description: 'The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ log_threshold }}).'
+ summary: "High number of errors in Neutron logs"
+ description: "The average per-second rate of errors in Neutron logs on the {{ $labels.host }} node is {{ $value }} (as measured over the last 5 minutes)."
+{%- endraw %}
{%- endif %}
{%- endif %}