Alerts reworked

Change alert names, severities, and descriptions.

Change-Id: I4b3efcaadf9e6f217a6821e441ae6cf8786604ea
Closes-bug: PROD-19878
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
index 72a87d0..5fa9ec5 100644
--- a/opencontrail/meta/prometheus.yml
+++ b/opencontrail/meta/prometheus.yml
@@ -1,93 +1,6 @@
 {%- if pillar.opencontrail is defined %}
   {%- from "opencontrail/map.jinja" import control, collector, compute, config, database, web, monitoring with context %}
 
-  {%- set all_contrail_processes = [] %}
-  {%- set all_contrail_apis = [] %}
-
-  {%- if collector.get('enabled', False) %}
-    {%- set collector_apis = ( 'contrail.collector', ) %}
-    {%- set collector_processes = (
-        'contrail-alarm-gen', 'contrail-analytics-api', 'contrail-collector',
-        'contrail-nodemgr', 'contrail-query-engine', 'contrail-snmp-collector',
-        'contrail-supervisord-analytics', 'contrail-topology',
-        ) %}
-
-    {%- for api in collector_apis %}
-      {% do all_contrail_apis.append(api) %}
-    {% endfor %}
-
-    {%- for process in collector_processes %}
-      {% do all_contrail_processes.append(process) %}
-    {% endfor %}
-  {%- endif %}
-
-  {%- if compute.get('enabled', False) %}
-    {%- set compute_apis = ( 'contrail.vrouter', 'contrail.node.manager' ) %}
-    {%- set compute_processes = (
-        'contrail-nodemgr-vrouter', 'contrail-supervisord-vrouter', 'contrail-vrouter-agent'
-        ) %}
-
-    {%- for api in compute_apis %}
-      {% do all_contrail_apis.append(api) %}
-    {% endfor %}
-
-    {%- for process in compute_processes %}
-      {% do all_contrail_processes.append(process) %}
-    {% endfor %}
-  {%- endif %}
-
-  {%- if control.get('enabled', False) %}
-    {%- set control_apis = ( 'contrail.api', 'contrail.discovery' ) %}
-
-    {%- set control_processes = (
-    'contrail-api', 'contrail-control', 'contrail-device-manager',
-    'contrail-discovery', 'contrail-dns', 'contrail-job-server',
-    'contrail-named', 'contrail-nodemgr-config',
-    'contrail-nodemgr-control', 'contrail-schema',
-    'contrail-supervisord-config', 'contrail-supervisord-control',
-    'contrail-svc-monitor',
-    ) %}
-
-    {%- if config.get('ifmap', {}).get('engine', 'irond') == 'irond' %}
-      {%- set control_processes = control_processes + ('contrail-ifmap-server', 'contrail-irond',) %}
-    {%- endif %}
-
-    {%- for api in control_apis %}
-      {% do all_contrail_apis.append(api) %}
-    {% endfor %}
-
-    {%- for process in control_processes %}
-      {% do all_contrail_processes.append(process) %}
-    {% endfor %}
-  {%- endif %}
-
-  {%- if database.get('enabled', False) %}
-    {%- set database_processes = (
-        'kafka-server', 'cassandra-server',
-        'contrail-nodemgr-database', 'contrail-supervisord-database',
-        ) %}
-
-    {%- for process in database_processes %}
-      {% do all_contrail_processes.append(process) %}
-    {% endfor %}
-  {%- endif %}
-
-  {%- if web.get('enabled', False) %}
-    {%- if web.get('cache', {}).get('engine', '') == 'redis' %}
-      {%- set web_processes = (
-          'contrail-web-server', 'redis-server'
-          ) %}
-    {%- else %}
-      {%- set web_processes = (
-          'contrail-web-server',
-          ) %}
-    {%- endif %}
-
-    {%- for process in web_processes %}
-      {% do all_contrail_processes.append(process) %}
-    {% endfor %}
-  {%- endif %}
-
   {%- if database_processes is defined and
          database.get('cassandra', False) and
          exporters is defined %}
@@ -119,272 +32,264 @@
     {%- endload %}
 {{ new_exporters_cfg|yaml(False) }}
   {%- endif %}
-
-  {%- if control_processes is defined or
-         collector_processes is defined or
-         compute_processes is defined or
-         database_processes is defined or
-         web_processes is defined %}
 server:
   alert:
-    {%- for contrail_api in all_contrail_apis %}
-      {%- set words = contrail_api.split('.') %}
-    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIInfo:
+    ContrailAPIDown:
       if: >-
-        http_response_status{service=~"{{ contrail_api }}"} == 0
+        http_response_status{name=~"contrail.*"} == 0
       {%- raw %}
       for: 2m
       labels:
-        severity: info
-        service: "{{ $labels.service }}"
+        severity: minor
+        service: contrail
       annotations:
-        summary: "Endpoint check for '{{ $labels.service }}' is failed"
-        description: Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}
+        summary: "{{ $labels.name }} API endpoint is not accessible"
+        description: "The {{ $labels.name }} API endpoint on the {{ $labels.host }} node is not accessible for at least 2 minutes."
       {%- endraw %}
-    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIWarning:
+    ContrailAPIDownMinor:
       if: >-
-        count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) >= count(http_response_status{service=~"{{ contrail_api }}"}) by (service) *{{ monitoring.services_failed_warning_threshold_percent }}
+        count(http_response_status{name=~"contrail.*"} == 0) by (name) >= count(http_response_status{name=~"contrail.*"}) by (name) *{{ monitoring.services_failed_warning_threshold_percent }}
       {%- raw %}
       for: 2m
       labels:
-        severity: warning
-        service: "{{ $labels.service }}"
+        severity: minor
+        service: contrail
       annotations:
-        summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
-        description: More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down
+        summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.name }} API endpoints are not accessible"
+        description: "{{ $value }} {{ $labels.name }} API endpoints are not accessible (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})."
       {%- endraw %}
-    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APICritical:
+    ContrailAPIDownMajor:
       if: >-
-        count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) >= count(http_response_status{service=~"{{ contrail_api }}"}) by (service) *{{ monitoring.services_failed_critical_threshold_percent }}
+        count(http_response_status{name=~"contrail.*"} == 0) by (name) >= count(http_response_status{name=~"contrail.*"}) by (name) *{{ monitoring.services_failed_critical_threshold_percent }}
+      {%- raw %}
+      for: 2m
+      labels:
+        severity: major
+        service: contrail
+      annotations:
+        summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.name }} API endpoints are not accessible"
+        description: "{{ $value }} {{ $labels.name }} API endpoints are not accessible (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+      {%- endraw %}
+    ContrailAPIOutage:
+      if: >-
+        count(http_response_status{name=~"contrail.*"} == 0) by (name) == count(http_response_status{name=~"contrail.*"}) by (name)
       {%- raw %}
       for: 2m
       labels:
         severity: critical
-        service: "{{ $labels.service }}"
+        service: contrail
       annotations:
-        summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
-        description: More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down
+        summary: "{{ $labels.name }} API outage"
+        description: "The {{ $labels.name }} API is not accessible for all available endpoints."
       {%- endraw %}
-    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+    ContrailProcessDown:
       if: >-
-        count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) == count(http_response_status{service=~"{{ contrail_api }}"}) by (service)
-      {%- raw %}
-      for: 2m
+        procstat_running{process_name=~"contrail.*"} == 0
       labels:
-        severity: down
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "All '{{ $labels.service }}' APIs are down"
-        description: All '{{ $labels.service }}' APIs are down
-      {%- endraw %}
-    {%- endfor %}
-{%- for contrail_process in all_contrail_processes %}
-      {%- set words = contrail_process.split('-') %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessInfo:
-      if: >-
-        procstat_running{process_name="{{ contrail_process }}"} == 0
-      labels:
-        severity: info
-        service: {{ contrail_process }}
+        severity: minor
+        service: contrail
       annotations:
     {%- raw %}
-        summary: '{{ $labels.service }} service is down'
-        description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
+        summary: "{{ $labels.process_name }} process is down"
+        description: "The {{ $labels.process_name }} process on the {{ $labels.host }} node is down."
     {%- endraw %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessWarning:
+    ContrailProcessDownMinor:
       if: >-
-        count(procstat_running{process_name="{{ contrail_process }}"} == 0) >= count(procstat_running{process_name="{{ contrail_process }}"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+        count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_warning_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name)
       labels:
-        severity: warning
-        service: {{ contrail_process }}
+        severity: minor
+        service: contrail
       annotations:
     {%- raw %}
-        summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
-        description: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
+        summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.process_name }} processes are down"
+        description: "{{ $value }} {{ $labels.process_name }} processes are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})."
     {%- endraw %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessCritical:
+    ContrailProcessDownMajor:
       if: >-
-        count(procstat_running{process_name="{{ contrail_process }}"} == 0) >= count(procstat_running{process_name="{{ contrail_process }}"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+        count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_critical_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name)
+      labels:
+        severity: major
+        service: contrail
+      annotations:
+    {%- raw %}
+        summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.process_name }} processes are down"
+        description: "{{ $value }} {{ $labels.process_name }} processes are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+    {%- endraw %}
+    ContrailProcessOutage:
+      if: >-
+        count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) == count(procstat_running{process_name=~"contrail.*"}) by (process_name)
       labels:
         severity: critical
-        service: {{ contrail_process }}
+        service: contrail
       annotations:
     {%- raw %}
-        summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
-        description: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
+        summary: "{{ $labels.process_name }} service outage"
+        description: "All {{ $labels.process_name }} processes are down."
     {%- endraw %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
-      if: >-
-        count(procstat_running{process_name="{{ contrail_process }}"} == 0) == count(procstat_running{process_name="{{ contrail_process }}"})
-      labels:
-        severity: down
-        service: {{ contrail_process }}
-      annotations:
-    {%- raw %}
-        summary: "All '{{ $labels.service }}' services are down"
-        description: "All '{{ $labels.service }}' services are down"
-    {%- endraw %}
-{%- endfor %}
     {%- if control_processes is defined %}
 {%- raw %}
-    ContrailBGPSessionsNoneUp:
-      if: >-
-        max(contrail_bgp_session_up_count) by (host) == 0
-      for: 2m
-      labels:
-        severity: warning
-        service: contrail-control
-      annotations:
-        summary: 'no active BGP sessions'
-        description: 'There are no active BGP sessions on node {{ $labels.host }}'
-    ContrailBGPSessionsSomeDown:
-      if: >-
-        min(contrail_bgp_session_down_count) by (host) > 0
-      for: 2m
-      labels:
-        severity: warning
-        service: contrail-control
-      annotations:
-        summary: 'inactive BGP sessions'
-        description: 'There are inactive BGP sessions on node {{ $labels.host }}'
-    ContrailBGPSessionsNone:
+    ContrailBGPSessionsNoEstablished:
       if: >-
         max(contrail_bgp_session_count) by (host) == 0
       for: 2m
       labels:
         severity: warning
-        service: contrail-control
+        service: contrail
       annotations:
-        summary: 'No BGP sessions'
-        description: 'There are no BGP sessions on node {{ $labels.host }}'
-    ContrailXMPPSessionsNoneUp:
+        summary: "No established BGP sessions"
+        description: "There are no established BGP sessions on the {{ $labels.host }} node for at least 2 minutes."
+    ContrailBGPSessionsNoActive:
       if: >-
-        sum(contrail_xmpp_session_up_count) < count(contrail_vrouter_xmpp) * 2
+        max(contrail_bgp_session_up_count) by (host) == 0
       for: 2m
       labels:
         severity: warning
-        service: contrail-control
+        service: contrail
       annotations:
-        summary: 'Unavailable established XMPP sessions'
-        description: 'There is compute instance without established XMPP session'
-    ContrailXMPPSessionsSomeDown:
+        summary: "No active BGP sessions"
+        description: "There are no active BGP sessions on the {{ $labels.host }} node for at least 2 minutes."
+    ContrailBGPSessionsDown:
+      if: >-
+        min(contrail_bgp_session_down_count) by (host) > 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail
+      annotations:
+        summary: "BGP sessions are down"
+        description: "{{ $value }} BGP sessions on the {{ $labels.host }} node are down for at least 2 minutes."
+    ContrailXMPPSessionsMissingEstablished:
+      if: >-
+        count(contrail_vrouter_xmpp) * 2 - sum(contrail_xmpp_session_up_count) > 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail
+      annotations:
+        summary: "Missing established XMPP sessions"
+        description: "{{ $value }} established XMPP sessions are missing on the compute cluster for at least 2 minutes."
+    ContrailXMPPSessionsMissing:
+      if: >-
+        count(contrail_vrouter_xmpp) * 2 - sum(contrail_xmpp_session_count) > 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail
+      annotations:
+        summary: "Missing XMPP sessions"
+        description: "{{ $value }} XMPP sessions are missing on the compute cluster for at least 2 minutes."
+    ContrailXMPPSessionsDown:
       if: >-
         min(contrail_xmpp_session_down_count) by (host) > 0
       for: 2m
       labels:
         severity: warning
-        service: contrail-control
+        service: contrail
       annotations:
-        summary: 'inactive XMPP sessions'
-        description: 'There are inactive XMPP sessions on node {{ $labels.host }}'
-    ContrailXMPPSessionsNone:
-      if: >-
-        sum(contrail_xmpp_session_count) < count(contrail_vrouter_xmpp) * 2
-      for: 2m
-      labels:
-        severity: warning
-        service: contrail-control
-      annotations:
-        summary: 'Unavailable XMPP sessions'
-        description: 'There is compute instance with missing XMPP session'
-    ContrailXMPPSessionsTooMany:
-      if: >-
+        summary: "XMPP sessions are down"
+        description: "{{ $value }} XMPP sessions on the {{ $labels.host }} node are down for at least 2 minutes."
 {%- endraw %}
+    ContrailXMPPSessionsTooHigh:
+      if: >-
     {%- set xmpp_toohigh_threshold = monitoring.xmpp_sessions_too_high_threshold %}
         min(contrail_xmpp_session_count) by (host) >= {{ xmpp_toohigh_threshold }}
 {%- raw %}
       for: 2m
       labels:
         severity: warning
-        service: contrail-control
+        service: contrail
       annotations:
-        summary: 'Too many XMPP sessions'
-        description: 'There are too many XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ xmpp_toohigh_threshold }})'
-    ContrailXMPPSessionsTooManyVariations:
+        summary: "XMPP sessions reached the limit of {%- endraw %} {{ xmpp_toohigh_threshold }}{%- raw %}"
+        description: "{{ $value }} XMPP sessions on the {{ $labels.host }} node are open for at least 2 minutes."
+{%- endraw %}
+    ContrailXMPPSessionsChangesTooHigh:
       if: >-
     {%- set xmpp_variation_threshold = monitoring.xmpp_sessions_variation_threshold %}
         abs(delta(contrail_xmpp_session_count[2m])) >= {{ xmpp_variation_threshold }}
 {%- raw %}
       labels:
         severity: warning
-        service: contrail-control
+        service: contrail
       annotations:
-        summary: 'Number of XMPP sessions changed between checks is too high'
-        description: 'There are too many XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ xmpp_variation_threshold }})'
+        summary: "XMPP sessions changes reached the limit of {%- endraw %} {{ xmpp_variation_threshold }}{%- raw %}"
+        description: "XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
+      {%- endraw %}
     {%- endif %}
 
     {%- if compute_processes is defined %}
 {%- raw %}
-    ContrailVrouterXMPPSessionsNone:
+    ContrailVrouterXMPPSessionsZero:
       if: >-
-        max(contrail_vrouter_xmpp) by (host) == 0
+        min(contrail_vrouter_xmpp) by (host) == 0
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'No vRouter XMPP sessions'
-        description: 'There are no vRouter XMPP sessions on node {{ $labels.host }}'
-    ContrailVrouterXMPPSessionsTooMany:
-      if: >-
+        summary: "No vRouter XMPP sessions"
+        description: "There are no vRouter XMPP sessions on the {{ $labels.host }} node for at least 2 minutes."
 {%- endraw %}
+    ContrailVrouterXMPPSessionsTooHigh:
+      if: >-
     {%- set vrouter_xmpp_toohigh_threshold = monitoring.vrouter_xmpp_sessions_too_high_threshold %}
         min(contrail_vrouter_xmpp) by (host) >= {{ vrouter_xmpp_toohigh_threshold }}
 {%- raw %}
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter XMPP sessions'
-        description: 'There are too many vRouter XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_xmpp_toohigh_threshold }})'
-    ContrailVrouterXMPPSessionsTooManyVariations:
+        summary: "vRouter XMPP sessions reached the limit of {%- endraw %} {{ vrouter_xmpp_toohigh_threshold }}{%- raw %}"
+        description: "{{ $value }} vRouter XMPP sessions are open on the {{ $labels.host }} node for at least 2 minutes."
+{%- endraw %}
+    ContrailVrouterXMPPSessionsChangesTooHigh:
       if: >-
     {%- set vrouter_xmpp_variation_threshold = monitoring.vrouter_xmpp_sessions_variation_threshold %}
         abs(delta(contrail_vrouter_xmpp[2m])) >= {{ vrouter_xmpp_variation_threshold }}
 {%- raw %}
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Number of vRouter XMPP sessions changed between checks is too high'
-        description: 'There are too many vRouter XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_xmpp_variation_threshold }})'
-{%- raw %}
-    ContrailVrouterDNSXMPPSessionsNone:
+        summary: "vRouter XMPP sessions changes reached the limit of {%- endraw %} {{ vrouter_xmpp_variation_threshold }}{%- raw %}"
+        description: "vRouter XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
+    ContrailVrouterDNSXMPPSessionsZero:
       if: >-
-        max(contrail_vrouter_dns_xmpp) by (host) == 0
+        min(contrail_vrouter_dns_xmpp) by (host) == 0
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'No vRouter DNS-XMPP sessions'
-        description: 'There are no vRouter DNS-XMPP sessions on node {{ $labels.host }}'
-    ContrailVrouterDNSXMPPSessionsTooMany:
-      if: >-
+        summary: "No vRouter DNS-XMPP sessions"
+        description: "There are no vRouter DNS-XMPP sessions on the {{ $labels.host }} node for at least 2 minutes."
 {%- endraw %}
+    ContrailVrouterDNSXMPPSessionsTooHigh:
+      if: >-
     {%- set vrouter_dns_xmpp_toohigh_threshold = monitoring.vrouter_dns_xmpp_sessions_too_high_threshold %}
         min(contrail_vrouter_dns_xmpp) by (host) >= {{ vrouter_dns_xmpp_toohigh_threshold }}
 {%- raw %}
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter DNS-XMPP sessions'
-        description: 'There are too many vRouter DNS-XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_dns_xmpp_toohigh_threshold }})'
-    ContrailVrouterDNSXMPPSessionsTooManyVariations:
+        summary: "vRouter DNS-XMPP sessions reached the limit of {%- endraw %} {{ vrouter_dns_xmpp_toohigh_threshold }}{%- raw %}"
+        description: "{{ $value }} vRouter DNS-XMPP sessions are open on the {{ $labels.host }} node for at least 2 minutes."
+{%- endraw %}
+    ContrailVrouterDNSXMPPSessionsChangesTooHigh:
       if: >-
     {%- set vrouter_dns_xmpp_variation_threshold = monitoring.vrouter_dns_xmpp_sessions_variation_threshold %}
         abs(delta(contrail_vrouter_dns_xmpp[2m])) >= {{ vrouter_dns_xmpp_variation_threshold }}
 {%- raw %}
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Number of vRouter DNS-XMPP sessions changed between checks is too high'
-        description: 'There are too many vRouter DNS-XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_dns_xmpp_variation_threshold }})'
-    ContrailVrouterLLSSessionsTooMany:
+        summary: "vRouter DNS-XMPP sessions changes reached the limit of {%- endraw %} {{ vrouter_dns_xmpp_variation_threshold }}{%- raw %}"
+        description: "vRouter DNS-XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
+{%- endraw %}
+    ContrailVrouterLLSSessionsTooHigh:
       if: >-
     {%- set vrouter_lls_toohigh_threshold = monitoring.vrouter_lls_too_high_threshold %}
         min(contrail_vrouter_lls) by (host) >= {{ vrouter_lls_toohigh_threshold }}
@@ -392,22 +297,24 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter LLS sessions'
-        description: 'There are too many vRouter LLS sessions on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_lls_toohigh_threshold }})'
-    ContrailVrouterLLSSessionsTooManyVariations:
+        summary: "vRouter LLS sessions reached the limit of {%- endraw %} {{ vrouter_lls_toohigh_threshold }}{%- raw %}"
+        description: "{{ $value }} vRouter LLS sessions are open on the {{ $labels.host }} node for at least 2 minutes."
+{%- endraw %}
+    ContrailVrouterLLSSessionsChangesTooHigh:
       if: >-
     {%- set vrouter_lls_variation_threshold = monitoring.vrouter_lls_variation_threshold %}
         abs(delta(contrail_vrouter_lls[2m])) >= {{ vrouter_lls_variation_threshold }}
 {%- raw %}
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Number of vRouter LLS sessions changed between checks is too high'
-        description: 'There are too many vRouter LLS sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_lls_variation_threshold }})'
-    ContrailFlowsActiveTooMany:
+        summary: "vRouter LLS sessions changes reached the limit of {%- endraw %} {{ vrouter_lls_variation_threshold }}{%- raw %}"
+        description: "vRouter LLS sessions on the {{ $labels.host }} node have changed {{ $value }} times."
+{%- endraw %}
+    ContrailFlowsActiveTooHigh:
       if: >-
     {%- set vrouter_flows_active_toohigh_threshold = monitoring.vrouter_flows_active_too_high_threshold %}
         deriv(contrail_vrouter_flows_active[5m]) >= {{ vrouter_flows_active_toohigh_threshold }}
@@ -415,11 +322,12 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter active flows'
-        description: 'There are too many active vRouter flows on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_active_toohigh_threshold }})'
-    ContrailFlowsDiscardTooMany:
+        summary: "vRouter active flows reached the limit of {%- endraw %} {{ vrouter_flows_active_toohigh_threshold }}{%- raw %}"
+        description: "{{ $value }} vRouter flows per second on the {{ $labels.host }} node are active for at least 2 minutes."
+{%- endraw %}
+    ContrailFlowsDiscardedTooHigh:
       if: >-
     {%- set vrouter_flows_discard_toohigh_threshold = monitoring.vrouter_flows_discard_too_high_threshold %}
         rate(contrail_vrouter_flows_discard[5m]) >= {{ vrouter_flows_discard_toohigh_threshold }}
@@ -427,11 +335,12 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter discarded flows'
-        description: 'There are too many discarded vRouter flows on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_discard_toohigh_threshold }})'
-    ContrailFlowsDropTooMany:
+        summary: "vRouter discarded flows reached the limit of {%- endraw %} {{ vrouter_flows_discard_toohigh_threshold }}{%- raw %}/s"
+        description: "An average per-second rate of discarded vRouter flows on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+    ContrailFlowsDroppedTooHigh:
       enabled: false
       if: >-
     {%- set vrouter_flows_flow_action_drop_toohigh_threshold = monitoring.vrouter_flows_flow_action_drop_too_high_threshold %}
@@ -440,11 +349,12 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter dropped flows'
-        description: 'There are too many dropped vRouter flows on node {{ $labels.host }} (current value={{ $value }} flows/s, threshold={%- endraw %}{{ vrouter_flows_flow_action_drop_toohigh_threshold }} flows/s)'
-    ContrailFlowsFragErrTooMany:
+        summary: "vRouter dropped flows reached the limit of {%- endraw %} {{ vrouter_flows_flow_action_drop_toohigh_threshold }}{%- raw %}/s"
+        description: "An average per-second rate of dropped vRouter flows on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+    ContrailFlowsFragErrTooHigh:
       if: >-
     {%- set vrouter_flows_frag_err_toohigh_threshold = monitoring.vrouter_flows_frag_err_too_high_threshold %}
         min(contrail_vrouter_flows_frag_err) by (host) >= {{ vrouter_flows_frag_err_toohigh_threshold }}
@@ -452,11 +362,12 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter flows with fragment errors'
-        description: 'There are too many vRouter flows with fragment errors on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_frag_err_toohigh_threshold }})'
-    ContrailFlowsInvalidNHTooMany:
+        summary: "vRouter flows with fragment errors reached the limit of {%- endraw %} {{ vrouter_flows_frag_err_toohigh_threshold }}{%- raw %}"
+        description: "{{ $value }} vRouter flows on the {{ $labels.host }} node had fragment errors for at least 2 minutes."
+{%- endraw %}
+    ContrailFlowsNextHopInvalidTooHigh:
       if: >-
     {%- set vrouter_flows_invalid_nh_toohigh_threshold = monitoring.vrouter_flows_invalid_nh_too_high_threshold %}
         rate(contrail_vrouter_flows_invalid_nh[5m]) >= {{ vrouter_flows_invalid_nh_toohigh_threshold }}
@@ -464,11 +375,12 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter flows with invalid next hop'
-        description: 'There are too many vRouter flows with invalid next hop on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_invalid_nh_toohigh_threshold }})'
-    ContrailFlowsInvalidITFTooMany:
+        summary: "vRouter flows with an invalid next hop reached the limit of {%- endraw %} {{ vrouter_flows_invalid_nh_toohigh_threshold }}{%- raw %}/s"
+        description: "An average per-second rate of vRouter flows with an invalid next hop on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+    ContrailFlowsInterfaceInvalidTooHigh:
       if: >-
     {%- set vrouter_flows_composite_invalid_interface_toohigh_threshold = monitoring.vrouter_flows_composite_invalid_interface_too_high_threshold %}
         rate(contrail_vrouter_flows_composite_invalid_interface[5m]) >= {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}
@@ -476,11 +388,12 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter flows with composite invalid interface'
-        description: 'There are too many vRouter flows with composite invalid interface on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_composite_invalid_interface_toohigh_threshold }})'
-    ContrailFlowsInvalidLabelTooMany:
+        summary: "vRouter flows with an invalid composite interface reached the limit of {%- endraw %} {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}{%- raw %}/s"
+        description: "An average per-second rate of vRouter flows with an invalid composite interface on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+    ContrailFlowsLabelInvalidTooHigh:
       if: >-
     {%- set vrouter_flows_invalid_label_toohigh_threshold = monitoring.vrouter_flows_invalid_label_too_high_threshold %}
         min(contrail_vrouter_flows_invalid_label) by (host) >= {{ vrouter_flows_invalid_label_toohigh_threshold }}
@@ -488,11 +401,12 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter flows with invalid label'
-        description: 'There are too many vRouter flows with invalid label on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_invalid_label_toohigh_threshold }})'
-    ContrailFlowsQueueLimitExceededTooMany:
+        summary: "vRouter flows with an invalid label reached the limit of {%- endraw %} {{ vrouter_flows_invalid_label_toohigh_threshold }}{%- raw %}"
+        description: "{{ $value }} vRouter flows on the {{ $labels.host }} node had an invalid label for at least 2 minutes."
+{%- endraw %}
+    ContrailFlowsQueueSizeExceededTooHigh:
       if: >-
     {%- set vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold = monitoring.vrouter_flows_flow_queue_limit_exceeded_too_high_threshold %}
         rate(contrail_vrouter_flows_flow_queue_limit_exceeded[5m]) >= {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}
@@ -500,11 +414,12 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter flows with queue limit exceeded'
-        description: 'There are too many vRouter flows with queue limit exceeded on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }})'
-    ContrailFlowsTableFullTooMany:
+        summary: "vRouter flows exceeding the queue size reached the limit of {%- endraw %} {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}{%- raw %}/s"
+        description: "An average per-second rate of vRouter flows exceeding the queue size on the {{ $labels.host }} node is {{ $value }} for at least 2 minutes."
+{%- endraw %}
+    ContrailFlowsTableFullTooHigh:
       if: >-
     {%- set vrouter_flows_flow_table_full_toohigh_threshold = monitoring.vrouter_flows_flow_table_full_too_high_threshold %}
         min(contrail_vrouter_flows_flow_table_full) by (host) >= {{ vrouter_flows_flow_table_full_toohigh_threshold }}
@@ -512,13 +427,136 @@
       for: 2m
       labels:
         severity: warning
-        service: contrail-compute
+        service: contrail
       annotations:
-        summary: 'Too many vRouter flows with table full'
-        description: 'There are too many vRouter flows with table full on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_flow_table_full_toohigh_threshold }})'
+        summary: "vRouter flows with full table reached the limit of {%- endraw %} {{ vrouter_flows_flow_table_full_toohigh_threshold }}{%- raw %}"
+        description: "{{ $value }} vRouter flows on the {{ $labels.host }} node had a full table for at least 2 minutes."
+{%- endraw %}
     {%- endif %}
-
+    {%- if web.get('enabled', False) and web.get('cache', {}).get('engine', '') == 'redis' %}
+    {%- raw %}
+    RedisServiceDown:
+      if: >-
+        procstat_running{process_name="redis-server"} == 0
+      labels:
+        severity: minor
+        service: redis
+      annotations:
+        summary: "Redis service is down"
+        description: "The Redis service on the {{ $labels.host }} node is down."
+    {%- endraw %}
+    RedisServiceDownMinor:
+      if: >-
+        count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+    {%- raw %}
+      labels:
+        severity: minor
+        service: redis
+      annotations:
+        summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Redis services are down"
+        description: "{{ $value }} Redis services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%)."
+    RedisServiceDownMajor:
+      if: >-
+        count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+    {%- raw %}
+      labels:
+        severity: major
+        service: redis
+      annotations:
+        summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Redis services are down"
+        description: "{{ $value }} Redis services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+    RedisServiceOutage:
+      if: >-
+        count(procstat_running{process_name="redis-server"} == 0) == count(procstat_running{process_name="redis-server"})
+      labels:
+        severity: critical
+        service: redis
+      annotations:
+        summary: "Redis service outage"
+        description: "All Redis services are down."
+    {%- endraw %}
+    {%- endif %}
     {%- if database.get('enabled', False) %}
+    {%- raw %}
+    CassandraServiceDown:
+      if: >-
+        procstat_running{process_name="cassandra-server"} == 0
+      labels:
+        severity: minor
+        service: cassandra
+      annotations:
+        summary: "Cassandra service is down"
+        description: "The Cassandra service on the {{ $labels.host }} node is down."
+    {%- endraw %}
+    CassandraServiceDownMinor:
+      if: >-
+        count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+    {%- raw %}
+      labels:
+        severity: minor
+        service: cassandra
+      annotations:
+        summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Cassandra services are down"
+        description: "{{ $value }} Cassandra services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%)."
+    CassandraServiceDownMajor:
+      if: >-
+        count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+    {%- raw %}
+      labels:
+        severity: major
+        service: cassandra
+      annotations:
+        summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Cassandra services are down"
+        description: "{{ $value }} Cassandra services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+    CassandraServiceOutage:
+      if: >-
+        count(procstat_running{process_name="cassandra-server"} == 0) == count(procstat_running{process_name="cassandra-server"})
+      labels:
+        severity: critical
+        service: cassandra
+      annotations:
+        summary: "Cassandra service outage"
+        description: "All Cassandra services are down."
+    KafkaServiceDown:
+      if: >-
+        procstat_running{process_name="kafka-server"} == 0
+      labels:
+        severity: minor
+        service: kafka
+      annotations:
+        summary: "Kafka service is down"
+        description: "The Kafka service on the {{ $labels.host }} node is down."
+    {%- endraw %}
+    KafkaServiceDownMinor:
+      if: >-
+        count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+      labels:
+        severity: minor
+        service: kafka
+      annotations:
+    {%- raw %}
+        summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Kafka services are down"
+        description: "{{ $value }} Kafka services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})."
+    {%- endraw %}
+    KafkaServiceDownMajor:
+      if: >-
+        count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+    {%- raw %}
+      labels:
+        severity: major
+        service: kafka
+      annotations:
+        summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Kafka services are down"
+        description: "{{ $value }} Kafka services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
+    KafkaServiceOutage:
+      if: >-
+        count(procstat_running{process_name="kafka-server"} == 0) == count(procstat_running{process_name="kafka-server"})
+      labels:
+        severity: critical
+        service: kafka
+      annotations:
+        summary: "Kafka service outage"
+        description: "All Kafka services are down."{% endraw %}
     ZookeeperServiceDown:
       if: >-
         zookeeper_up == 0
@@ -529,7 +567,7 @@
       annotations:
         summary: "Zookeeper service is down"
         description: "The Zookeeper service on the {% raw %}{{ $labels.host }}{% endraw %} node is down for at least 2 minutes."
-    ZookeeperServiceError:
+    ZookeeperServiceErrorWarning:
       if: >-
         zookeeper_service_health == 0
       for: 2m
@@ -574,6 +612,4 @@
     {%- if exporters is defined %}
       {%- include "prometheus/_exporters_config.sls" %}
     {%- endif %}
-  {%- endif %}
 {%- endif %}
-