Alerts reworked
Change alert names, severities, and descriptions.
Change-Id: I4b3efcaadf9e6f217a6821e441ae6cf8786604ea
Closes-bug: PROD-19878
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
index 72a87d0..5fa9ec5 100644
--- a/opencontrail/meta/prometheus.yml
+++ b/opencontrail/meta/prometheus.yml
@@ -1,93 +1,6 @@
{%- if pillar.opencontrail is defined %}
{%- from "opencontrail/map.jinja" import control, collector, compute, config, database, web, monitoring with context %}
- {%- set all_contrail_processes = [] %}
- {%- set all_contrail_apis = [] %}
-
- {%- if collector.get('enabled', False) %}
- {%- set collector_apis = ( 'contrail.collector', ) %}
- {%- set collector_processes = (
- 'contrail-alarm-gen', 'contrail-analytics-api', 'contrail-collector',
- 'contrail-nodemgr', 'contrail-query-engine', 'contrail-snmp-collector',
- 'contrail-supervisord-analytics', 'contrail-topology',
- ) %}
-
- {%- for api in collector_apis %}
- {% do all_contrail_apis.append(api) %}
- {% endfor %}
-
- {%- for process in collector_processes %}
- {% do all_contrail_processes.append(process) %}
- {% endfor %}
- {%- endif %}
-
- {%- if compute.get('enabled', False) %}
- {%- set compute_apis = ( 'contrail.vrouter', 'contrail.node.manager' ) %}
- {%- set compute_processes = (
- 'contrail-nodemgr-vrouter', 'contrail-supervisord-vrouter', 'contrail-vrouter-agent'
- ) %}
-
- {%- for api in compute_apis %}
- {% do all_contrail_apis.append(api) %}
- {% endfor %}
-
- {%- for process in compute_processes %}
- {% do all_contrail_processes.append(process) %}
- {% endfor %}
- {%- endif %}
-
- {%- if control.get('enabled', False) %}
- {%- set control_apis = ( 'contrail.api', 'contrail.discovery' ) %}
-
- {%- set control_processes = (
- 'contrail-api', 'contrail-control', 'contrail-device-manager',
- 'contrail-discovery', 'contrail-dns', 'contrail-job-server',
- 'contrail-named', 'contrail-nodemgr-config',
- 'contrail-nodemgr-control', 'contrail-schema',
- 'contrail-supervisord-config', 'contrail-supervisord-control',
- 'contrail-svc-monitor',
- ) %}
-
- {%- if config.get('ifmap', {}).get('engine', 'irond') == 'irond' %}
- {%- set control_processes = control_processes + ('contrail-ifmap-server', 'contrail-irond',) %}
- {%- endif %}
-
- {%- for api in control_apis %}
- {% do all_contrail_apis.append(api) %}
- {% endfor %}
-
- {%- for process in control_processes %}
- {% do all_contrail_processes.append(process) %}
- {% endfor %}
- {%- endif %}
-
- {%- if database.get('enabled', False) %}
- {%- set database_processes = (
- 'kafka-server', 'cassandra-server',
- 'contrail-nodemgr-database', 'contrail-supervisord-database',
- ) %}
-
- {%- for process in database_processes %}
- {% do all_contrail_processes.append(process) %}
- {% endfor %}
- {%- endif %}
-
- {%- if web.get('enabled', False) %}
- {%- if web.get('cache', {}).get('engine', '') == 'redis' %}
- {%- set web_processes = (
- 'contrail-web-server', 'redis-server'
- ) %}
- {%- else %}
- {%- set web_processes = (
- 'contrail-web-server',
- ) %}
- {%- endif %}
-
- {%- for process in web_processes %}
- {% do all_contrail_processes.append(process) %}
- {% endfor %}
- {%- endif %}
-
-{%- if database_processes is defined and
+{%- if database.get('enabled', False) and
database.get('cassandra', False) and
exporters is defined %}
@@ -119,272 +32,264 @@
{%- endload %}
{{ new_exporters_cfg|yaml(False) }}
{%- endif %}
-
- {%- if control_processes is defined or
- collector_processes is defined or
- compute_processes is defined or
- database_processes is defined or
- web_processes is defined %}
server:
alert:
- {%- for contrail_api in all_contrail_apis %}
- {%- set words = contrail_api.split('.') %}
- {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIInfo:
+ ContrailAPIDown:
if: >-
- http_response_status{service=~"{{ contrail_api }}"} == 0
+ http_response_status{name=~"contrail.*"} == 0
{%- raw %}
for: 2m
labels:
- severity: info
- service: "{{ $labels.service }}"
+ severity: minor
+ service: contrail
annotations:
- summary: "Endpoint check for '{{ $labels.service }}' is failed"
- description: Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}
+ summary: "{{ $labels.name }} API endpoint is not accessible"
+ description: "The {{ $labels.name }} API endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes."
{%- endraw %}
- {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIWarning:
+ ContrailAPIDownMinor:
if: >-
- count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) >= count(http_response_status{service=~"{{ contrail_api }}"}) by (service) *{{ monitoring.services_failed_warning_threshold_percent }}
+ count(http_response_status{name=~"contrail.*"} == 0) by (name) >= count(http_response_status{name=~"contrail.*"}) by (name) *{{ monitoring.services_failed_warning_threshold_percent }}
{%- raw %}
for: 2m
labels:
- severity: warning
- service: "{{ $labels.service }}"
+ severity: minor
+ service: contrail
annotations:
- summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
- description: More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down
+ summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.name }} API endpoints are not accessible"
+ description: "{{ $value }} {{ $labels.name }} API endpoints are not accessible (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})."
{%- endraw %}
- {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APICritical:
+ ContrailAPIDownMajor:
if: >-
- count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) >= count(http_response_status{service=~"{{ contrail_api }}"}) by (service) *{{ monitoring.services_failed_critical_threshold_percent }}
+ count(http_response_status{name=~"contrail.*"} == 0) by (name) >= count(http_response_status{name=~"contrail.*"}) by (name) *{{ monitoring.services_failed_critical_threshold_percent }}
+ {%- raw %}
+ for: 2m
+ labels:
+ severity: major
+ service: contrail
+ annotations:
+ summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.name }} API endpoints are not accessible"
+ description: "{{ $value }} {{ $labels.name }} API endpoints are not accessible (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+ {%- endraw %}
+ ContrailAPIOutage:
+ if: >-
+ count(http_response_status{name=~"contrail.*"} == 0) by (name) == count(http_response_status{name=~"contrail.*"}) by (name)
{%- raw %}
for: 2m
labels:
severity: critical
- service: "{{ $labels.service }}"
+ service: contrail
annotations:
- summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
- description: More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down
+ summary: "{{ $labels.name }} API outage"
+ description: "The {{ $labels.name }} API is not accessible for all available endpoints."
{%- endraw %}
- {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+ ContrailProcessDown:
if: >-
- count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) == count(http_response_status{service=~"{{ contrail_api }}"}) by (service)
- {%- raw %}
- for: 2m
+ procstat_running{process_name=~"contrail.*"} == 0
labels:
- severity: down
- service: "{{ $labels.service }}"
- annotations:
- summary: "All '{{ $labels.service }}' APIs are down"
- description: All '{{ $labels.service }}' APIs are down
- {%- endraw %}
- {%- endfor %}
-{%- for contrail_process in all_contrail_processes %}
- {%- set words = contrail_process.split('-') %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessInfo:
- if: >-
- procstat_running{process_name="{{ contrail_process }}"} == 0
- labels:
- severity: info
- service: {{ contrail_process }}
+ severity: minor
+ service: contrail
annotations:
{%- raw %}
- summary: '{{ $labels.service }} service is down'
- description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
+ summary: "{{ $labels.process_name }} process is down"
+ description: "The {{ $labels.process_name }} process on the {{ $labels.host }} node is down."
{%- endraw %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessWarning:
+ ContrailProcessDownMinor:
if: >-
- count(procstat_running{process_name="{{ contrail_process }}"} == 0) >= count(procstat_running{process_name="{{ contrail_process }}"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+ count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_warning_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name)
labels:
- severity: warning
- service: {{ contrail_process }}
+ severity: minor
+ service: contrail
annotations:
{%- raw %}
- summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
- description: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
+ summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.process_name }} processes are down"
+ description: "{{ $value }} {{ $labels.process_name }} processes are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})."
{%- endraw %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessCritical:
+ ContrailProcessDownMajor:
if: >-
- count(procstat_running{process_name="{{ contrail_process }}"} == 0) >= count(procstat_running{process_name="{{ contrail_process }}"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+ count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_critical_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name)
+ labels:
+ severity: major
+ service: contrail
+ annotations:
+ {%- raw %}
+ summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.process_name }} processes are down"
+ description: "{{ $value }} {{ $labels.process_name }} processes are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+ {%- endraw %}
+ ContrailProcessOutage:
+ if: >-
+ count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) == count(procstat_running{process_name=~"contrail.*"}) by (process_name)
labels:
severity: critical
- service: {{ contrail_process }}
+ service: contrail
annotations:
{%- raw %}
- summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
- description: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
+ summary: "{{ $labels.process_name }} service outage"
+ description: "All {{ $labels.process_name }} processes are down."
{%- endraw %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
- if: >-
- count(procstat_running{process_name="{{ contrail_process }}"} == 0) == count(procstat_running{process_name="{{ contrail_process }}"})
- labels:
- severity: down
- service: {{ contrail_process }}
- annotations:
- {%- raw %}
- summary: "All '{{ $labels.service }}' services are down"
- description: "All '{{ $labels.service }}' services are down"
- {%- endraw %}
-{%- endfor %}
-{%- if control_processes is defined %}
+{%- if control.get('enabled', False) %}
{%- raw %}
- ContrailBGPSessionsNoneUp:
- if: >-
- max(contrail_bgp_session_up_count) by (host) == 0
- for: 2m
- labels:
- severity: warning
- service: contrail-control
- annotations:
- summary: 'no active BGP sessions'
- description: 'There are no active BGP sessions on node {{ $labels.host }}'
- ContrailBGPSessionsSomeDown:
- if: >-
- min(contrail_bgp_session_down_count) by (host) > 0
- for: 2m
- labels:
- severity: warning
- service: contrail-control
- annotations:
- summary: 'inactive BGP sessions'
- description: 'There are inactive BGP sessions on node {{ $labels.host }}'
- ContrailBGPSessionsNone:
+ ContrailBGPSessionsNoEstablished:
if: >-
max(contrail_bgp_session_count) by (host) == 0
for: 2m
labels:
severity: warning
- service: contrail-control
+ service: contrail
annotations:
- summary: 'No BGP sessions'
- description: 'There are no BGP sessions on node {{ $labels.host }}'
- ContrailXMPPSessionsNoneUp:
+ summary: "No established BGP sessions"
+ description: "There are no established BGP sessions on the {{ $labels.host }} node for at least 2 minutes."
+ ContrailBGPSessionsNoActive:
if: >-
- sum(contrail_xmpp_session_up_count) < count(contrail_vrouter_xmpp) * 2
+ max(contrail_bgp_session_up_count) by (host) == 0
for: 2m
labels:
severity: warning
- service: contrail-control
+ service: contrail
annotations:
- summary: 'Unavailable established XMPP sessions'
- description: 'There is compute instance without established XMPP session'
- ContrailXMPPSessionsSomeDown:
+ summary: "No active BGP sessions"
+ description: "There are no active BGP sessions on the {{ $labels.host }} node for at least 2 minutes."
+ ContrailBGPSessionsDown:
+ if: >-
+ min(contrail_bgp_session_down_count) by (host) > 0
+ for: 2m
+ labels:
+ severity: warning
+ service: contrail
+ annotations:
+ summary: "BGP sessions are down"
+ description: "{{ $value }} BGP sessions on the {{ $labels.host }} node are down for at least 2 minutes."
+ ContrailXMPPSessionsMissingEstablished:
+ if: >-
+ count(contrail_vrouter_xmpp) * 2 - sum(contrail_xmpp_session_up_count) > 0
+ for: 2m
+ labels:
+ severity: warning
+ service: contrail
+ annotations:
+ summary: "Missing established XMPP sessions"
+ description: "{{ $value }} established XMPP sessions are missing on the compute cluster for at least 2 minutes."
+ ContrailXMPPSessionsMissing:
+ if: >-
+ count(contrail_vrouter_xmpp) * 2 - sum(contrail_xmpp_session_count) > 0
+ for: 2m
+ labels:
+ severity: warning
+ service: contrail
+ annotations:
+ summary: "Missing XMPP sessions"
+ description: "{{ $value }} XMPP sessions are missing on the compute cluster for at least 2 minutes."
+ ContrailXMPPSessionsDown:
if: >-
min(contrail_xmpp_session_down_count) by (host) > 0
for: 2m
labels:
severity: warning
- service: contrail-control
+ service: contrail
annotations:
- summary: 'inactive XMPP sessions'
- description: 'There are inactive XMPP sessions on node {{ $labels.host }}'
- ContrailXMPPSessionsNone:
- if: >-
- sum(contrail_xmpp_session_count) < count(contrail_vrouter_xmpp) * 2
- for: 2m
- labels:
- severity: warning
- service: contrail-control
- annotations:
- summary: 'Unavailable XMPP sessions'
- description: 'There is compute instance with missing XMPP session'
- ContrailXMPPSessionsTooMany:
- if: >-
+ summary: "XMPP sessions are down"
+ description: "{{ $value }} XMPP sessions on the {{ $labels.host }} node are down for at least 2 minutes."
{%- endraw %}
+ ContrailXMPPSessionsTooHigh:
+ if: >-
{%- set xmpp_toohigh_threshold = monitoring.xmpp_sessions_too_high_threshold %}
min(contrail_xmpp_session_count) by (host) >= {{ xmpp_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
- service: contrail-control
+ service: contrail
annotations:
- summary: 'Too many XMPP sessions'
- description: 'There are too many XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ xmpp_toohigh_threshold }})'
- ContrailXMPPSessionsTooManyVariations:
+ summary: "XMPP sessions reached the limit of {%- endraw %} {{ xmpp_toohigh_threshold }}{%- raw %}"
+ description: "{{ $value }} XMPP sessions on the {{ $labels.host }} node are open for at least 2 minutes."
+{%- endraw %}
+ ContrailXMPPSessionsChangesTooHigh:
if: >-
{%- set xmpp_variation_threshold = monitoring.xmpp_sessions_variation_threshold %}
abs(delta(contrail_xmpp_session_count[2m])) >= {{ xmpp_variation_threshold }}
{%- raw %}
labels:
severity: warning
- service: contrail-control
+ service: contrail
annotations:
- summary: 'Number of XMPP sessions changed between checks is too high'
- description: 'There are too many XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ xmpp_variation_threshold }})'
+ summary: "XMPP sessions changes reached the limit of {%- endraw %} {{ xmpp_variation_threshold }}{%- raw %}"
+ description: "XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
+ {%- endraw %}
{%- endif %}
-{%- if compute_processes is defined %}
+{%- if compute.get('enabled', False) %}
{%- raw %}
- ContrailVrouterXMPPSessionsNone:
+ ContrailVrouterXMPPSessionsZero:
if: >-
- max(contrail_vrouter_xmpp) by (host) == 0
+ min(contrail_vrouter_xmpp) by (host) == 0
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'No vRouter XMPP sessions'
- description: 'There are no vRouter XMPP sessions on node {{ $labels.host }}'
- ContrailVrouterXMPPSessionsTooMany:
- if: >-
+ summary: "No vRouter XMPP sessions"
+ description: "There are no vRouter XMPP sessions on the {{ $labels.host }} node for at least 2 minutes."
{%- endraw %}
+ ContrailVrouterXMPPSessionsTooHigh:
+ if: >-
{%- set vrouter_xmpp_toohigh_threshold = monitoring.vrouter_xmpp_sessions_too_high_threshold %}
min(contrail_vrouter_xmpp) by (host) >= {{ vrouter_xmpp_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter XMPP sessions'
- description: 'There are too many vRouter XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_xmpp_toohigh_threshold }})'
- ContrailVrouterXMPPSessionsTooManyVariations:
+ summary: "vRouter XMPP sessions reached the limit of {%- endraw %} {{ vrouter_xmpp_toohigh_threshold }}{%- raw %}"
+ description: "{{ $value }} vRouter XMPP sessions are open on the {{ $labels.host }} node for at least 2 minutes."
+{%- endraw %}
+ ContrailVrouterXMPPSessionsChangesTooHigh:
if: >-
{%- set vrouter_xmpp_variation_threshold = monitoring.vrouter_xmpp_sessions_variation_threshold %}
abs(delta(contrail_vrouter_xmpp[2m])) >= {{ vrouter_xmpp_variation_threshold }}
{%- raw %}
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Number of vRouter XMPP sessions changed between checks is too high'
- description: 'There are too many vRouter XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_xmpp_variation_threshold }})'
-{%- raw %}
- ContrailVrouterDNSXMPPSessionsNone:
+ summary: "vRouter XMPP sessions changes reached the limit of {%- endraw %} {{ vrouter_xmpp_variation_threshold }}{%- raw %}"
+ description: "vRouter XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
+ ContrailVrouterDNSXMPPSessionsZero:
if: >-
- max(contrail_vrouter_dns_xmpp) by (host) == 0
+ min(contrail_vrouter_dns_xmpp) by (host) == 0
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'No vRouter DNS-XMPP sessions'
- description: 'There are no vRouter DNS-XMPP sessions on node {{ $labels.host }}'
- ContrailVrouterDNSXMPPSessionsTooMany:
- if: >-
+ summary: "No vRouter DNS-XMPP sessions"
+ description: "There are no vRouter DNS-XMPP sessions on the {{ $labels.host }} node for at least 2 minutes."
{%- endraw %}
+ ContrailVrouterDNSXMPPSessionsTooHigh:
+ if: >-
{%- set vrouter_dns_xmpp_toohigh_threshold = monitoring.vrouter_dns_xmpp_sessions_too_high_threshold %}
min(contrail_vrouter_dns_xmpp) by (host) >= {{ vrouter_dns_xmpp_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter DNS-XMPP sessions'
- description: 'There are too many vRouter DNS-XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_dns_xmpp_toohigh_threshold }})'
- ContrailVrouterDNSXMPPSessionsTooManyVariations:
+ summary: "vRouter DNS-XMPP sessions reached the limit of {%- endraw %} {{ vrouter_dns_xmpp_toohigh_threshold }}{%- raw %}"
+ description: "{{ $value }} vRouter DNS-XMPP sessions are open on the {{ $labels.host }} node for at least 2 minutes."
+{%- endraw %}
+ ContrailVrouterDNSXMPPSessionsChangesTooHigh:
if: >-
{%- set vrouter_dns_xmpp_variation_threshold = monitoring.vrouter_dns_xmpp_sessions_variation_threshold %}
abs(delta(contrail_vrouter_dns_xmpp[2m])) >= {{ vrouter_dns_xmpp_variation_threshold }}
{%- raw %}
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Number of vRouter DNS-XMPP sessions changed between checks is too high'
- description: 'There are too many vRouter DNS-XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_dns_xmpp_variation_threshold }})'
- ContrailVrouterLLSSessionsTooMany:
+ summary: "vRouter DNS-XMPP sessions changes reached the limit of {%- endraw %} {{ vrouter_dns_xmpp_variation_threshold }}{%- raw %}"
+ description: "vRouter DNS-XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
+{%- endraw %}
+ ContrailVrouterLLSSessionsTooHigh:
if: >-
{%- set vrouter_lls_toohigh_threshold = monitoring.vrouter_lls_too_high_threshold %}
min(contrail_vrouter_lls) by (host) >= {{ vrouter_lls_toohigh_threshold }}
@@ -392,22 +297,24 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter LLS sessions'
- description: 'There are too many vRouter LLS sessions on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_lls_toohigh_threshold }})'
- ContrailVrouterLLSSessionsTooManyVariations:
+ summary: "vRouter LLS sessions reached the limit of {%- endraw %} {{ vrouter_lls_toohigh_threshold }}{%- raw %}"
+ description: "{{ $value }} vRouter LLS sessions are open on the {{ $labels.host }} node for at least 2 minutes."
+{%- endraw %}
+ ContrailVrouterLLSSessionsChangesTooHigh:
if: >-
{%- set vrouter_lls_variation_threshold = monitoring.vrouter_lls_variation_threshold %}
abs(delta(contrail_vrouter_lls[2m])) >= {{ vrouter_lls_variation_threshold }}
{%- raw %}
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Number of vRouter LLS sessions changed between checks is too high'
- description: 'There are too many vRouter LLS sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_lls_variation_threshold }})'
- ContrailFlowsActiveTooMany:
+ summary: "vRouter LLS sessions changes reached the limit of {%- endraw %} {{ vrouter_lls_variation_threshold }}{%- raw %}"
+ description: "vRouter LLS sessions on the {{ $labels.host }} node have changed {{ $value }} times."
+{%- endraw %}
+ ContrailFlowsActiveTooHigh:
if: >-
{%- set vrouter_flows_active_toohigh_threshold = monitoring.vrouter_flows_active_too_high_threshold %}
deriv(contrail_vrouter_flows_active[5m]) >= {{ vrouter_flows_active_toohigh_threshold }}
@@ -415,11 +322,12 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter active flows'
- description: 'There are too many active vRouter flows on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_active_toohigh_threshold }})'
- ContrailFlowsDiscardTooMany:
+ summary: "vRouter active flows reached the limit of {%- endraw %} {{ vrouter_flows_active_toohigh_threshold }}{%- raw %}"
+ description: "{{ $value }} vRouter flows per second on the {{ $labels.host }} node are active for at least 2 minutes."
+{%- endraw %}
+ ContrailFlowsDiscardedTooHigh:
if: >-
{%- set vrouter_flows_discard_toohigh_threshold = monitoring.vrouter_flows_discard_too_high_threshold %}
rate(contrail_vrouter_flows_discard[5m]) >= {{ vrouter_flows_discard_toohigh_threshold }}
@@ -427,11 +335,12 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter discarded flows'
- description: 'There are too many discarded vRouter flows on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_discard_toohigh_threshold }})'
- ContrailFlowsDropTooMany:
+ summary: "vRouter discarded flows reached the limit of {%- endraw %} {{ vrouter_flows_discard_toohigh_threshold }}{%- raw %}/s"
+ description: "An average per-second rate of discarded vRouter flows on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+ ContrailFlowsDroppedTooHigh:
enabled: false
if: >-
{%- set vrouter_flows_flow_action_drop_toohigh_threshold = monitoring.vrouter_flows_flow_action_drop_too_high_threshold %}
@@ -440,11 +349,12 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter dropped flows'
- description: 'There are too many dropped vRouter flows on node {{ $labels.host }} (current value={{ $value }} flows/s, threshold={%- endraw %}{{ vrouter_flows_flow_action_drop_toohigh_threshold }} flows/s)'
- ContrailFlowsFragErrTooMany:
+ summary: "vRouter dropped flows reached the limit of {%- endraw %} {{ vrouter_flows_flow_action_drop_toohigh_threshold }}{%- raw %}/s"
+ description: "An average per-second rate of dropped vRouter flows on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+ ContrailFlowsFragErrTooHigh:
if: >-
{%- set vrouter_flows_frag_err_toohigh_threshold = monitoring.vrouter_flows_frag_err_too_high_threshold %}
min(contrail_vrouter_flows_frag_err) by (host) >= {{ vrouter_flows_frag_err_toohigh_threshold }}
@@ -452,11 +362,12 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter flows with fragment errors'
- description: 'There are too many vRouter flows with fragment errors on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_frag_err_toohigh_threshold }})'
- ContrailFlowsInvalidNHTooMany:
+ summary: "vRouter flows with fragment errors reached the limit of {%- endraw %} {{ vrouter_flows_frag_err_toohigh_threshold }}{%- raw %}"
+ description: "{{ $value }} vRouter flows on the {{ $labels.host }} node had fragment errors for at least 2 minutes."
+{%- endraw %}
+ ContrailFlowsNextHopInvalidTooHigh:
if: >-
{%- set vrouter_flows_invalid_nh_toohigh_threshold = monitoring.vrouter_flows_invalid_nh_too_high_threshold %}
rate(contrail_vrouter_flows_invalid_nh[5m]) >= {{ vrouter_flows_invalid_nh_toohigh_threshold }}
@@ -464,11 +375,12 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter flows with invalid next hop'
- description: 'There are too many vRouter flows with invalid next hop on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_invalid_nh_toohigh_threshold }})'
- ContrailFlowsInvalidITFTooMany:
+ summary: "vRouter flows with an invalid next hop reached the limit of {%- endraw %} {{ vrouter_flows_invalid_nh_toohigh_threshold }}{%- raw %}/s"
+ description: "An average per-second rate of vRouter flows with an invalid next hop on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+ ContrailFlowsInterfaceInvalidTooHigh:
if: >-
{%- set vrouter_flows_composite_invalid_interface_toohigh_threshold = monitoring.vrouter_flows_composite_invalid_interface_too_high_threshold %}
rate(contrail_vrouter_flows_composite_invalid_interface[5m]) >= {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}
@@ -476,11 +388,12 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter flows with composite invalid interface'
- description: 'There are too many vRouter flows with composite invalid interface on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_composite_invalid_interface_toohigh_threshold }})'
- ContrailFlowsInvalidLabelTooMany:
+ summary: "vRouter flows with an invalid composite interface reached the limit of {%- endraw %} {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}{%- raw %}/s"
+ description: "An average per-second rate of vRouter flows with an invalid composite interface on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+ ContrailFlowsLabelInvalidTooHigh:
if: >-
{%- set vrouter_flows_invalid_label_toohigh_threshold = monitoring.vrouter_flows_invalid_label_too_high_threshold %}
min(contrail_vrouter_flows_invalid_label) by (host) >= {{ vrouter_flows_invalid_label_toohigh_threshold }}
@@ -488,11 +401,12 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter flows with invalid label'
- description: 'There are too many vRouter flows with invalid label on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_invalid_label_toohigh_threshold }})'
- ContrailFlowsQueueLimitExceededTooMany:
+ summary: "vRouter flows with an invalid label reached the limit of {%- endraw %} {{ vrouter_flows_invalid_label_toohigh_threshold }}{%- raw %}"
+ description: "{{ $value }} vRouter flows on the {{ $labels.host }} node had an invalid label for at least 2 minutes."
+{%- endraw %}
+ ContrailFlowsQueueSizeExceededTooHigh:
if: >-
{%- set vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold = monitoring.vrouter_flows_flow_queue_limit_exceeded_too_high_threshold %}
rate(contrail_vrouter_flows_flow_queue_limit_exceeded[5m]) >= {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}
@@ -500,11 +414,12 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter flows with queue limit exceeded'
- description: 'There are too many vRouter flows with queue limit exceeded on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }})'
- ContrailFlowsTableFullTooMany:
+ summary: "vRouter flows exceeding the queue size reached the limit of {%- endraw %} {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}{%- raw %}/s"
+ description: "An average per-second rate of vRouter flows exceeding the queue size on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+ ContrailFlowsTableFullTooHigh:
if: >-
{%- set vrouter_flows_flow_table_full_toohigh_threshold = monitoring.vrouter_flows_flow_table_full_too_high_threshold %}
min(contrail_vrouter_flows_flow_table_full) by (host) >= {{ vrouter_flows_flow_table_full_toohigh_threshold }}
@@ -512,13 +427,136 @@
for: 2m
labels:
severity: warning
- service: contrail-compute
+ service: contrail
annotations:
- summary: 'Too many vRouter flows with table full'
- description: 'There are too many vRouter flows with table full on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_flow_table_full_toohigh_threshold }})'
+ summary: "vRouter flows with full table reached the limit of {%- endraw %} {{ vrouter_flows_flow_table_full_toohigh_threshold }}{%- raw %}"
+ description: "{{ $value }} vRouter flows on the {{ $labels.host }} node had a full table for at least 2 minutes."
+{%- endraw %}
{%- endif %}
-
+ {%- if web.get('enabled', False) and web.get('cache', {}).get('engine', '') == 'redis' %}
+ {%- raw %}
+ RedisServiceDown:  # matches any redis-server series whose procstat_running value equals 0
+ if: >-
+ procstat_running{process_name="redis-server"} == 0
+ labels:
+ severity: minor
+ service: redis
+ annotations:  # the surrounding raw section passes the $labels.host placeholder through Jinja untouched
+ summary: "Redis service is down"
+ description: "The Redis service on the {{ $labels.host }} node is down."
+ {%- endraw %}
+ RedisServiceDownMinor:  # compares the count of down redis-server processes with services_failed_warning_threshold_percent of all of them
+ if: >-
+ count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+ {%- raw %}
+ labels:
+ severity: minor
+ service: redis
+ annotations:  # both strings step out of raw so Jinja can print the threshold multiplied by 100; the description leaves raw closed
+ summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Redis services are down"
+ description: "{{ $value }} Redis services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%)."
+ RedisServiceDownMajor:  # compares the count of down redis-server processes with services_failed_critical_threshold_percent of all of them
+ if: >-
+ count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+ {%- raw %}
+ labels:
+ severity: major
+ service: redis
+ annotations:  # both strings step out of raw for the threshold; the description re-opens raw afterwards
+ summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Redis services are down"
+ description: "{{ $value }} Redis services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+ RedisServiceOutage:  # true when the count of down redis-server processes equals the count of all of them
+ if: >-
+ count(procstat_running{process_name="redis-server"} == 0) == count(procstat_running{process_name="redis-server"})
+ labels:
+ severity: critical
+ service: redis
+ annotations:  # static text only; no placeholders are used here
+ summary: "Redis service outage"
+ description: "All Redis services are down."
+ {%- endraw %}
+ {%- endif %}
{%- if database.get('enabled', False) %}
+ {%- raw %}
+ CassandraServiceDown:  # matches any cassandra-server series whose procstat_running value equals 0
+ if: >-
+ procstat_running{process_name="cassandra-server"} == 0
+ labels:
+ severity: minor
+ service: cassandra
+ annotations:  # the surrounding raw section passes the $labels.host placeholder through Jinja untouched
+ summary: "Cassandra service is down"
+ description: "The Cassandra service on the {{ $labels.host }} node is down."
+ {%- endraw %}
+ CassandraServiceDownMinor:  # compares the count of down cassandra-server processes with services_failed_warning_threshold_percent of all of them
+ if: >-
+ count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+ {%- raw %}
+ labels:
+ severity: minor
+ service: cassandra
+ annotations:  # both strings step out of raw so Jinja can print the threshold multiplied by 100; the description leaves raw closed
+ summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Cassandra services are down"
+ description: "{{ $value }} Cassandra services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%)."
+ CassandraServiceDownMajor:  # compares the count of down cassandra-server processes with services_failed_critical_threshold_percent of all of them
+ if: >-
+ count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+ {%- raw %}
+ labels:
+ severity: major
+ service: cassandra
+ annotations:  # both strings step out of raw for the threshold; the description re-opens raw afterwards
+ summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Cassandra services are down"
+ description: "{{ $value }} Cassandra services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+ CassandraServiceOutage:  # true when the count of down cassandra-server processes equals the count of all of them
+ if: >-
+ count(procstat_running{process_name="cassandra-server"} == 0) == count(procstat_running{process_name="cassandra-server"})
+ labels:
+ severity: critical
+ service: cassandra
+ annotations:  # static text only; no placeholders are used here
+ summary: "Cassandra service outage"
+ description: "All Cassandra services are down."
+ KafkaServiceDown:  # matches any kafka-server series whose procstat_running value equals 0
+ if: >-
+ procstat_running{process_name="kafka-server"} == 0
+ labels:
+ severity: minor
+ service: kafka
+ annotations:  # the description carries a $labels.host placeholder; the endraw below closes the raw section it sits in
+ summary: "Kafka service is down"
+ description: "The Kafka service on the {{ $labels.host }} node is down."
+ {%- endraw %}
+ KafkaServiceDownMinor:  # compares the count of down kafka-server processes with services_failed_warning_threshold_percent of all of them
+ if: >-
+ count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+ labels:
+ severity: minor
+ service: kafka
+ annotations:  # raw is opened only around the two strings below and closed again right after them
+ {%- raw %}
+ summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Kafka services are down"
+ description: "{{ $value }} Kafka services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})."
+ {%- endraw %}
+ KafkaServiceDownMajor:  # compares the count of down kafka-server processes with services_failed_critical_threshold_percent of all of them
+ if: >-
+ count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+ {%- raw %}
+ labels:
+ severity: major
+ service: kafka
+ annotations:  # the description must leave raw closed: no endraw follows before ZookeeperServiceDown, which wraps its own placeholder in raw/endraw
+ summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Kafka services are down"
+ description: "{{ $value }} Kafka services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%)."
+ KafkaServiceOutage:  # true when the count of down kafka-server processes equals the count of all of them
+ if: >-
+ count(procstat_running{process_name="kafka-server"} == 0) == count(procstat_running{process_name="kafka-server"})
+ labels:
+ severity: critical
+ service: kafka
+ annotations:  # static text only; no placeholders are used here
+ summary: "Kafka service outage"
+ description: "All Kafka services are down."
ZookeeperServiceDown:
if: >-
zookeeper_up == 0
@@ -529,7 +567,7 @@
annotations:
summary: "Zookeeper service is down"
description: "The Zookeeper service on the {% raw %}{{ $labels.host }}{% endraw %} node is down for at least 2 minutes."
- ZookeeperServiceError:
+ ZookeeperServiceErrorWarning:
if: >-
zookeeper_service_health == 0
for: 2m
@@ -574,6 +612,4 @@
{%- if exporters is defined %}
{%- include "prometheus/_exporters_config.sls" %}
{%- endif %}
- {%- endif %}
{%- endif %}
-